diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c index 4a8df99b3d48..c06e39005197 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/cxgbei.c @@ -1,1209 +1,1232 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Chelsio T5xx iSCSI driver * * Written by: Sreenivasa Honnur * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */ #include "tom/t4_tom.h" #include "cxgbei.h" static int worker_thread_count; -static struct cxgbei_worker_thread_softc *cwt_softc; -static struct proc *cxgbei_proc; +static struct cxgbei_worker_thread *cwt_rx_threads, *cwt_tx_threads; + +static void cwt_queue_for_rx(struct icl_cxgbei_conn *icc); static void read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len, uint32_t *max_rx_data_len, struct ppod_region *pr) { uint32_t tx_len, rx_len, r, v; rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); r = t4_read_reg(sc, A_TP_PARA_REG2); rx_len = min(rx_len, G_MAXRXDATA(r)); tx_len = min(tx_len, G_MAXRXDATA(r)); r = t4_read_reg(sc, A_TP_PARA_REG7); v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); rx_len = min(rx_len, v); tx_len = min(tx_len, v); /* * AHS is not supported by the kernel so we'll not account for * it either in our PDU len -> data segment len conversions. */ rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE; tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE; /* * DDP can place only 4 pages for a single PDU. A single * request might use larger pages than the smallest page size, * but that cannot be guaranteed. Assume the smallest DDP * page size for this limit. 
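The limit arithmetic in read_pdu_limits() boils down to: take the smallest of the PMM page size, the MAXRXDATA limit from TP_PARA_REG2 and the PMMAXXFERLEN limits from TP_PARA_REG7, subtract the BHS and digest overhead, and clamp the receive side to what four DDP pages of the smallest supported page size can hold. A minimal userspace sketch of the receive-side computation follows; the register values are made up for illustration and min_u32() is not a driver helper.

#include <stdint.h>
#include <stdio.h>

#define ISCSI_BHS_SIZE			48
#define ISCSI_HEADER_DIGEST_SIZE	4
#define ISCSI_DATA_DIGEST_SIZE		4

static uint32_t
min_u32(uint32_t a, uint32_t b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	/* Register-derived limits; the values here are made up. */
	uint32_t rx_len = 65536;	/* TP_PMM_RX_PAGE_SIZE */
	uint32_t maxrxdata = 16384;	/* G_MAXRXDATA(TP_PARA_REG2) */
	uint32_t pmmaxxferlen = 16384;	/* min(PMMAXXFERLEN0, PMMAXXFERLEN1) */
	uint32_t ddp_page_shift = 12;	/* smallest DDP page size: 4KB */

	rx_len = min_u32(rx_len, maxrxdata);
	rx_len = min_u32(rx_len, pmmaxxferlen);
	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;
	rx_len = min_u32(rx_len, 4 * (1U << ddp_page_shift));
	printf("max rx data segment length: %u\n", rx_len);
	return (0);
}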
*/ rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0])); if (chip_id(sc) == CHELSIO_T5) { tx_len = min(tx_len, 15360); rx_len = rounddown2(rx_len, 512); tx_len = rounddown2(tx_len, 512); } *max_tx_data_len = tx_len; *max_rx_data_len = rx_len; } /* * Initialize the software state of the iSCSI ULP driver. * * ENXIO means firmware didn't set up something that it was supposed to. */ static int cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) { struct sysctl_oid *oid; struct sysctl_oid_list *children; struct ppod_region *pr; uint32_t r; int rc; MPASS(sc->vres.iscsi.size > 0); MPASS(ci != NULL); pr = &ci->pr; r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ); rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods"); if (rc != 0) { device_printf(sc->dev, "%s: failed to initialize the iSCSI page pod region: %u.\n", __func__, rc); return (rc); } r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK); r &= V_ISCSITAGMASK(M_ISCSITAGMASK); if (r != pr->pr_tag_mask) { /* * Recent firmwares are supposed to set up the iSCSI tagmask * but we'll do it ourselves it the computed value doesn't match * what's in the register. */ device_printf(sc->dev, "tagmask 0x%08x does not match computed mask 0x%08x.\n", r, pr->pr_tag_mask); t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK, V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask); } read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr); sysctl_ctx_init(&ci->ctx); oid = device_get_sysctl_tree(sc->dev); /* dev.t5nex.X */ children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings"); children = SYSCTL_CHILDREN(oid); ci->ddp_threshold = 2048; SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold", CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold"); SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len", CTLFLAG_RD, &ci->max_rx_data_len, 0, "Maximum receive data segment length"); SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len", CTLFLAG_RD, &ci->max_tx_data_len, 0, "Maximum transmit data segment length"); return (0); } static int do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct icl_pdu *ip; struct icl_cxgbei_pdu *icp; uint16_t len_ddp = be16toh(cpl->pdu_len_ddp); uint16_t len = be16toh(cpl->len); M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len == len + sizeof(*cpl)); ip = icl_cxgbei_new_pdu(M_NOWAIT); if (ip == NULL) CXGBE_UNIMPLEMENTED("PDU allocation failure"); m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs); ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len; icp = ip_to_icp(ip); icp->icp_seq = ntohl(cpl->seq); icp->icp_flags = ICPF_RX_HDR; /* This is the start of a new PDU. There should be no old state. 
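The BHS carries DataSegmentLength as a 3-byte big-endian field, which parse_pdu(), do_rx_iscsi_cmp() and finalize_pdu() below open-code as shifts over bhs_data_segment_len[0..2]. A standalone sketch of that encoding; dsl_decode()/dsl_encode() are hypothetical helper names, not driver functions.

#include <stdint.h>
#include <stdio.h>

static uint32_t
dsl_decode(const uint8_t dsl[3])
{
	return ((uint32_t)dsl[0] << 16 | (uint32_t)dsl[1] << 8 | dsl[2]);
}

static void
dsl_encode(uint8_t dsl[3], uint32_t len)
{
	dsl[0] = len >> 16;	/* most significant byte first */
	dsl[1] = len >> 8;
	dsl[2] = len;
}

int
main(void)
{
	uint8_t dsl[3];

	dsl_encode(dsl, 8192);
	printf("%02x %02x %02x -> %u\n", dsl[0], dsl[1], dsl[2],
	    dsl_decode(dsl));
	return (0);
}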
*/ MPASS(toep->ulpcb2 == NULL); toep->ulpcb2 = icp; #if 0 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p", __func__, tid, len, len_ddp, icp); #endif m_freem(m); return (0); } static int do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct icl_cxgbei_pdu *icp = toep->ulpcb2; struct icl_pdu *ip; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl)); if (icp == NULL) { /* * T6 completion enabled, start of a new pdu. Header * will come in completion CPL. */ ip = icl_cxgbei_new_pdu(M_NOWAIT); if (ip == NULL) CXGBE_UNIMPLEMENTED("PDU allocation failure"); icp = ip_to_icp(ip); } else { /* T5 mode, header is already received. */ MPASS(icp->icp_flags == ICPF_RX_HDR); MPASS(icp->ip.ip_data_mbuf == NULL); MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl)); } /* Trim the cpl header from mbuf. */ m_adj(m, sizeof(*cpl)); icp->icp_flags |= ICPF_RX_FLBUF; icp->ip.ip_data_mbuf = m; toep->ofld_rxq->rx_iscsi_fl_pdus++; toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len; /* * For T6, save the icp for further processing in the * completion handler. */ if (icp->icp_flags == ICPF_RX_FLBUF) { MPASS(toep->ulpcb2 == NULL); toep->ulpcb2 = icp; } #if 0 CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid, be16toh(cpl->len), icp); #endif return (0); } static int mbuf_crc32c_helper(void *arg, void *data, u_int len) { uint32_t *digestp = arg; *digestp = calculate_crc32c(*digestp, data, len); return (0); } static struct icl_pdu * parse_pdu(struct socket *so, struct toepcb *toep, struct icl_cxgbei_conn *icc, struct sockbuf *sb, u_int total_len) { struct uio uio; struct iovec iov[2]; struct iscsi_bhs bhs; struct mbuf *m; struct icl_pdu *ip; u_int ahs_len, data_len, header_len, pdu_len; uint32_t calc_digest, wire_digest; int error; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; header_len = sizeof(struct iscsi_bhs); if (icc->ic.ic_header_crc32c) header_len += ISCSI_HEADER_DIGEST_SIZE; if (total_len < header_len) { ICL_WARN("truncated pre-offload PDU with len %u", total_len); return (NULL); } iov[0].iov_base = &bhs; iov[0].iov_len = sizeof(bhs); iov[1].iov_base = &wire_digest; iov[1].iov_len = sizeof(wire_digest); uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = header_len; error = soreceive(so, NULL, &uio, NULL, NULL, NULL); if (error != 0) { ICL_WARN("failed to read BHS from pre-offload PDU: %d", error); return (NULL); } ahs_len = bhs.bhs_total_ahs_len * 4; data_len = bhs.bhs_data_segment_len[0] << 16 | bhs.bhs_data_segment_len[1] << 8 | bhs.bhs_data_segment_len[2]; pdu_len = header_len + ahs_len + roundup2(data_len, 4); if (icc->ic.ic_data_crc32c && data_len != 0) pdu_len += ISCSI_DATA_DIGEST_SIZE; if (total_len < pdu_len) { ICL_WARN("truncated pre-offload PDU len %u vs %u", total_len, pdu_len); return (NULL); } if (ahs_len != 0) { ICL_WARN("received pre-offload PDU with AHS"); return (NULL); } if (icc->ic.ic_header_crc32c) { calc_digest = calculate_crc32c(0xffffffff, (caddr_t)&bhs, sizeof(bhs)); calc_digest ^= 0xffffffff; if (calc_digest != wire_digest) { ICL_WARN("received pre-offload PDU 0x%02x with " "invalid header digest (0x%x vs 0x%x)", bhs.bhs_opcode, wire_digest, calc_digest); toep->ofld_rxq->rx_iscsi_header_digest_errors++; return (NULL); } } m = NULL; if (data_len != 0) { uio.uio_iov = 
NULL; uio.uio_resid = roundup2(data_len, 4); if (icc->ic.ic_data_crc32c) uio.uio_resid += ISCSI_DATA_DIGEST_SIZE; error = soreceive(so, NULL, &uio, &m, NULL, NULL); if (error != 0) { ICL_WARN("failed to read data payload from " "pre-offload PDU: %d", error); return (NULL); } if (icc->ic.ic_data_crc32c) { m_copydata(m, roundup2(data_len, 4), sizeof(wire_digest), (caddr_t)&wire_digest); calc_digest = 0xffffffff; m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper, &calc_digest); calc_digest ^= 0xffffffff; if (calc_digest != wire_digest) { ICL_WARN("received pre-offload PDU 0x%02x " "with invalid data digest (0x%x vs 0x%x)", bhs.bhs_opcode, wire_digest, calc_digest); toep->ofld_rxq->rx_iscsi_data_digest_errors++; m_freem(m); return (NULL); } } } ip = icl_cxgbei_new_pdu(M_WAITOK); icl_cxgbei_new_pdu_set_conn(ip, &icc->ic); *ip->ip_bhs = bhs; ip->ip_data_len = data_len; ip->ip_data_mbuf = m; return (ip); } static void parse_pdus(struct icl_cxgbei_conn *icc, struct sockbuf *sb) { struct icl_conn *ic = &icc->ic; struct socket *so = ic->ic_socket; struct toepcb *toep = icc->toep; struct icl_pdu *ip, *lastip; u_int total_len; SOCKBUF_LOCK_ASSERT(sb); CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid, sbused(sb)); lastip = NULL; while (sbused(sb) != 0 && (sb->sb_state & SBS_CANTRCVMORE) == 0) { total_len = sbused(sb); SOCKBUF_UNLOCK(sb); ip = parse_pdu(so, toep, icc, sb, total_len); if (ip == NULL) { ic->ic_error(ic); SOCKBUF_LOCK(sb); return; } if (lastip == NULL) STAILQ_INSERT_HEAD(&icc->rcvd_pdus, ip, ip_next); else STAILQ_INSERT_AFTER(&icc->rcvd_pdus, lastip, ip, ip_next); lastip = ip; SOCKBUF_LOCK(sb); } } static int do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct socket *so; struct sockbuf *sb; struct tcpcb *tp; struct icl_cxgbei_conn *icc; struct icl_conn *ic; struct icl_cxgbei_pdu *icp = toep->ulpcb2; struct icl_pdu *ip; u_int pdu_len, val; struct epoch_tracker et; MPASS(m == NULL); /* Must already be assembling a PDU. */ MPASS(icp != NULL); MPASS(icp->icp_flags & ICPF_RX_HDR); /* Data is optional. */ MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0); pdu_len = be16toh(cpl->len); /* includes everything. 
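parse_pdu() above verifies the header digest (and, over the padded payload, the data digest) with calculate_crc32c(), seeding the CRC with 0xffffffff and inverting the result before comparing it with the digest read off the wire. A self-contained userspace sketch of the same convention, using a plain bitwise CRC32C (Castagnoli polynomial) instead of the kernel's calculate_crc32c():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC32C (reflected polynomial 0x82f63b78); slow but dependency-free. */
static uint32_t
crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len-- > 0) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0x82f63b78 : crc >> 1;
	}
	return (crc);
}

int
main(void)
{
	uint8_t bhs[48];
	uint32_t wire_digest, calc_digest;

	memset(bhs, 0, sizeof(bhs));
	bhs[0] = 0x01;	/* opcode byte, just for illustration */

	/* Sender side: digest appended after the BHS. */
	wire_digest = crc32c(0xffffffff, bhs, sizeof(bhs)) ^ 0xffffffff;

	/* Receiver side, mirroring the header-digest check in parse_pdu(). */
	calc_digest = crc32c(0xffffffff, bhs, sizeof(bhs)) ^ 0xffffffff;
	printf("header digest %s\n",
	    calc_digest == wire_digest ? "ok" : "mismatch");
	return (0);
}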
*/ val = be32toh(cpl->ddpvld); #if 0 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x", __func__, tid, pdu_len, val, icp->icp_flags); #endif icp->icp_flags |= ICPF_RX_STATUS; ip = &icp->ip; if (val & F_DDP_PADDING_ERR) { ICL_WARN("received PDU 0x%02x with invalid padding", ip->ip_bhs->bhs_opcode); toep->ofld_rxq->rx_iscsi_padding_errors++; } if (val & F_DDP_HDRCRC_ERR) { ICL_WARN("received PDU 0x%02x with invalid header digest", ip->ip_bhs->bhs_opcode); toep->ofld_rxq->rx_iscsi_header_digest_errors++; } if (val & F_DDP_DATACRC_ERR) { ICL_WARN("received PDU 0x%02x with invalid data digest", ip->ip_bhs->bhs_opcode); toep->ofld_rxq->rx_iscsi_data_digest_errors++; } if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) { MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); MPASS(ip->ip_data_len > 0); icp->icp_flags |= ICPF_RX_DDP; toep->ofld_rxq->rx_iscsi_ddp_pdus++; toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len; } INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, pdu_len, inp->inp_flags); INP_WUNLOCK(inp); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; return (0); } /* * T6+ does not report data PDUs received via DDP without F * set. This can result in gaps in the TCP sequence space. */ tp = intotcpcb(inp); MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt); tp->rcv_nxt = icp->icp_seq + pdu_len; tp->t_rcvtime = ticks; /* * Don't update the window size or return credits since RX * flow control is disabled. */ so = inp->inp_socket; sb = &so->so_rcv; SOCKBUF_LOCK(sb); icc = toep->ulpcb; if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) { CTR5(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x", __func__, tid, pdu_len, icc, sb->sb_state); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); CURVNET_SET(so->so_vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; return (0); } MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ic = &icc->ic; if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)) != 0) { SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; ic->ic_error(ic); return (0); } icl_cxgbei_new_pdu_set_conn(ip, ic); STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); - if ((icc->rx_flags & RXF_ACTIVE) == 0) { - struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt]; - - mtx_lock(&cwt->cwt_lock); - icc->rx_flags |= RXF_ACTIVE; - TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); - if (cwt->cwt_state == CWT_SLEEPING) { - cwt->cwt_state = CWT_RUNNING; - cv_signal(&cwt->cwt_cv); - } - mtx_unlock(&cwt->cwt_lock); + if (!icc->rx_active) { + icc->rx_active = true; + cwt_queue_for_rx(icc); } SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); toep->ulpcb2 = NULL; return (0); } static int do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct epoch_tracker et; struct adapter *sc = iq->adapter; struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct icl_cxgbei_pdu *icp = toep->ulpcb2; struct icl_pdu *ip; struct cxgbei_cmp *cmp; struct inpcb *inp = toep->inp; #ifdef INVARIANTS uint16_t len = be16toh(cpl->len); u_int data_digest_len; #endif struct socket *so; struct sockbuf *sb; struct tcpcb *tp; struct icl_cxgbei_conn 
*icc; struct icl_conn *ic; struct iscsi_bhs_data_out *bhsdo; u_int val = be32toh(cpl->ddpvld); u_int npdus, pdu_len; uint32_t prev_seg_len; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len == len + sizeof(*cpl)); if ((val & F_DDP_PDU) == 0) { MPASS(icp != NULL); MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0); ip = &icp->ip; } if (icp == NULL) { /* T6 completion enabled, start of a new PDU. */ ip = icl_cxgbei_new_pdu(M_NOWAIT); if (ip == NULL) CXGBE_UNIMPLEMENTED("PDU allocation failure"); icp = ip_to_icp(ip); } pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp)); #if 0 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p", __func__, tid, pdu_len, val, icp); #endif /* Copy header */ m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs); bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs; ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 | bhsdo->bhsdo_data_segment_len[1] << 8 | bhsdo->bhsdo_data_segment_len[2]; icp->icp_seq = ntohl(cpl->seq); icp->icp_flags |= ICPF_RX_HDR; icp->icp_flags |= ICPF_RX_STATUS; if (val & F_DDP_PADDING_ERR) { ICL_WARN("received PDU 0x%02x with invalid padding", ip->ip_bhs->bhs_opcode); toep->ofld_rxq->rx_iscsi_padding_errors++; } if (val & F_DDP_HDRCRC_ERR) { ICL_WARN("received PDU 0x%02x with invalid header digest", ip->ip_bhs->bhs_opcode); toep->ofld_rxq->rx_iscsi_header_digest_errors++; } if (val & F_DDP_DATACRC_ERR) { ICL_WARN("received PDU 0x%02x with invalid data digest", ip->ip_bhs->bhs_opcode); toep->ofld_rxq->rx_iscsi_data_digest_errors++; } INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, pdu_len, inp->inp_flags); INP_WUNLOCK(inp); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; m_freem(m); return (0); } tp = intotcpcb(inp); /* * If icc is NULL, the connection is being closed in * icl_cxgbei_conn_close(), just drop this data. */ icc = toep->ulpcb; if (__predict_false(icc == NULL)) { CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p", __func__, tid, pdu_len, icc); /* * Update rcv_nxt so the sequence number of the FIN * doesn't appear wrong. */ tp->rcv_nxt = icp->icp_seq + pdu_len; tp->t_rcvtime = ticks; INP_WUNLOCK(inp); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; m_freem(m); return (0); } MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ic = &icc->ic; if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)) != 0) { INP_WUNLOCK(inp); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; m_freem(m); ic->ic_error(ic); return (0); } #ifdef INVARIANTS data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ? ISCSI_DATA_DIGEST_SIZE : 0; MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len); #endif if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) { MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); MPASS(ip->ip_data_len > 0); icp->icp_flags |= ICPF_RX_DDP; bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs; switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) { case ISCSI_BHS_OPCODE_SCSI_DATA_IN: cmp = cxgbei_find_cmp(icc, be32toh(bhsdo->bhsdo_initiator_task_tag)); break; case ISCSI_BHS_OPCODE_SCSI_DATA_OUT: cmp = cxgbei_find_cmp(icc, be32toh(bhsdo->bhsdo_target_transfer_tag)); break; default: __assert_unreachable(); } MPASS(cmp != NULL); /* * The difference between the end of the last burst * and the offset of the last PDU in this burst is * the additional data received via DDP. 
*/ prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) - cmp->next_buffer_offset; if (prev_seg_len != 0) { uint32_t orig_datasn; /* * Return a "large" PDU representing the burst * of PDUs. Adjust the offset and length of * this PDU to represent the entire burst. */ ip->ip_data_len += prev_seg_len; bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len; bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8; bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16; bhsdo->bhsdo_buffer_offset = htobe32(cmp->next_buffer_offset); orig_datasn = htobe32(bhsdo->bhsdo_datasn); npdus = orig_datasn - cmp->last_datasn; bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1); cmp->last_datasn = orig_datasn; ip->ip_additional_pdus = npdus - 1; } else { MPASS(htobe32(bhsdo->bhsdo_datasn) == cmp->last_datasn + 1); npdus = 1; cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn); } cmp->next_buffer_offset += ip->ip_data_len; toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus; toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len; } else { MPASS(icp->icp_flags & (ICPF_RX_FLBUF)); MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len); } tp->rcv_nxt = icp->icp_seq + pdu_len; tp->t_rcvtime = ticks; /* * Don't update the window size or return credits since RX * flow control is disabled. */ so = inp->inp_socket; sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { CTR5(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x", __func__, tid, pdu_len, icc, sb->sb_state); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); CURVNET_SET(so->so_vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); icl_cxgbei_conn_pdu_free(NULL, ip); toep->ulpcb2 = NULL; m_freem(m); return (0); } icl_cxgbei_new_pdu_set_conn(ip, ic); /* Enqueue the PDU to the received pdus queue. */ STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); - if ((icc->rx_flags & RXF_ACTIVE) == 0) { - struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt]; - - mtx_lock(&cwt->cwt_lock); - icc->rx_flags |= RXF_ACTIVE; - TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); - if (cwt->cwt_state == CWT_SLEEPING) { - cwt->cwt_state = CWT_RUNNING; - cv_signal(&cwt->cwt_cv); - } - mtx_unlock(&cwt->cwt_lock); + if (!icc->rx_active) { + icc->rx_active = true; + cwt_queue_for_rx(icc); } SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); toep->ulpcb2 = NULL; m_freem(m); return (0); } static int cxgbei_activate(struct adapter *sc) { struct cxgbei_data *ci; int rc; ASSERT_SYNCHRONIZED_OP(sc); if (uld_active(sc, ULD_ISCSI)) { KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p", __func__, sc)); return (0); } if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) { device_printf(sc->dev, "not iSCSI offload capable, or capability disabled.\n"); return (ENOSYS); } /* per-adapter softc for iSCSI */ ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK); if (ci == NULL) return (ENOMEM); rc = cxgbei_init(sc, ci); if (rc != 0) { free(ci, M_CXGBE); return (rc); } sc->iscsi_ulp_softc = ci; return (0); } static int cxgbei_deactivate(struct adapter *sc) { struct cxgbei_data *ci = sc->iscsi_ulp_softc; ASSERT_SYNCHRONIZED_OP(sc); if (ci != NULL) { sysctl_ctx_free(&ci->ctx); t4_free_ppod_region(&ci->pr); free(ci, M_CXGBE); sc->iscsi_ulp_softc = NULL; } return (0); } static void cxgbei_activate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0) return; /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. 
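When a completion covers a burst of DDP-placed Data-In/Data-Out PDUs, do_rx_iscsi_cmp() above folds the silently placed PDUs into one coalesced PDU: the gap between the connection's expected buffer offset and the offset reported in the final PDU gives the extra payload, and the DataSN delta gives the PDU count. A hypothetical userspace model of that bookkeeping; struct burst_state and coalesce_burst() are illustrative names, not driver code.

#include <stdint.h>
#include <stdio.h>

struct burst_state {
	uint32_t next_buffer_offset;	/* where the previous burst ended */
	uint32_t last_datasn;		/* DataSN of the last reported PDU */
};

/*
 * The completion describes only the final PDU of a DDP'd burst.  Fold the
 * PDUs placed silently by DDP into one coalesced PDU that starts where the
 * previous burst ended.
 */
static void
coalesce_burst(struct burst_state *bs, uint32_t buffer_offset, uint32_t datasn,
    uint32_t final_pdu_len, uint32_t *coalesced_len, uint32_t *npdus)
{
	uint32_t prev_seg_len = buffer_offset - bs->next_buffer_offset;

	if (prev_seg_len != 0) {
		*coalesced_len = final_pdu_len + prev_seg_len;
		*npdus = datasn - bs->last_datasn;
	} else {
		*coalesced_len = final_pdu_len;
		*npdus = 1;
	}
	bs->last_datasn = datasn;
	bs->next_buffer_offset += *coalesced_len;
}

int
main(void)
{
	struct burst_state bs = { .next_buffer_offset = 8192, .last_datasn = 0 };
	uint32_t len, npdus;

	/*
	 * The first 8K PDU (DataSN 0) was already delivered; three more 8K
	 * PDUs (DataSN 1-3) are DDP'd and only the last one is reported.
	 */
	coalesce_burst(&bs, 24576, 3, 8192, &len, &npdus);
	printf("coalesced %u bytes over %u PDUs, next offset %u\n",
	    len, npdus, bs.next_buffer_offset);
	return (0);
}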
*/ if (sc->offload_map && !uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); end_synchronized_op(sc, 0); } static void cxgbei_deactivate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0) return; if (uld_active(sc, ULD_ISCSI)) (void) t4_deactivate_uld(sc, ULD_ISCSI); end_synchronized_op(sc, 0); } static struct uld_info cxgbei_uld_info = { .uld_id = ULD_ISCSI, .activate = cxgbei_activate, .deactivate = cxgbei_deactivate, }; static void -cwt_main(void *arg) +cwt_rx_main(void *arg) { - struct cxgbei_worker_thread_softc *cwt = arg; + struct cxgbei_worker_thread *cwt = arg; struct icl_cxgbei_conn *icc = NULL; struct icl_conn *ic; struct icl_pdu *ip; struct sockbuf *sb; STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus); MPASS(cwt != NULL); mtx_lock(&cwt->cwt_lock); MPASS(cwt->cwt_state == 0); cwt->cwt_state = CWT_RUNNING; cv_signal(&cwt->cwt_cv); while (__predict_true(cwt->cwt_state != CWT_STOP)) { cwt->cwt_state = CWT_RUNNING; - while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) { - TAILQ_REMOVE(&cwt->rx_head, icc, rx_link); + while ((icc = TAILQ_FIRST(&cwt->icc_head)) != NULL) { + TAILQ_REMOVE(&cwt->icc_head, icc, rx_link); mtx_unlock(&cwt->cwt_lock); ic = &icc->ic; sb = &ic->ic_socket->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sbused(sb)) != 0) { /* * PDUs were received before the tid * transitioned to ULP mode. Convert * them to icl_cxgbei_pdus and insert * them into the head of rcvd_pdus. */ parse_pdus(icc, sb); } - MPASS(icc->rx_flags & RXF_ACTIVE); + MPASS(icc->rx_active); if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) { MPASS(STAILQ_EMPTY(&rx_pdus)); STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu); SOCKBUF_UNLOCK(sb); /* Hand over PDUs to ICL. */ while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) { STAILQ_REMOVE_HEAD(&rx_pdus, ip_next); ic->ic_receive(ip); } SOCKBUF_LOCK(sb); MPASS(STAILQ_EMPTY(&rx_pdus)); } - MPASS(icc->rx_flags & RXF_ACTIVE); + MPASS(icc->rx_active); if (STAILQ_EMPTY(&icc->rcvd_pdus) || __predict_false(sb->sb_state & SBS_CANTRCVMORE)) { - icc->rx_flags &= ~RXF_ACTIVE; + icc->rx_active = false; + SOCKBUF_UNLOCK(sb); + + mtx_lock(&cwt->cwt_lock); } else { + SOCKBUF_UNLOCK(sb); + /* * More PDUs were received while we were busy * handing over the previous batch to ICL. * Re-add this connection to the end of the * queue. */ mtx_lock(&cwt->cwt_lock); - TAILQ_INSERT_TAIL(&cwt->rx_head, icc, + TAILQ_INSERT_TAIL(&cwt->icc_head, icc, rx_link); - mtx_unlock(&cwt->cwt_lock); } - SOCKBUF_UNLOCK(sb); - - mtx_lock(&cwt->cwt_lock); } /* Inner loop doesn't check for CWT_STOP, do that first. 
*/ if (__predict_false(cwt->cwt_state == CWT_STOP)) break; cwt->cwt_state = CWT_SLEEPING; cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); } - MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL); - mtx_assert(&cwt->cwt_lock, MA_OWNED); - cwt->cwt_state = CWT_STOPPED; - cv_signal(&cwt->cwt_cv); + MPASS(TAILQ_FIRST(&cwt->icc_head) == NULL); mtx_unlock(&cwt->cwt_lock); kthread_exit(); } +static void +cwt_queue_for_rx(struct icl_cxgbei_conn *icc) +{ + struct cxgbei_worker_thread *cwt = &cwt_rx_threads[icc->cwt]; + + mtx_lock(&cwt->cwt_lock); + TAILQ_INSERT_TAIL(&cwt->icc_head, icc, rx_link); + if (cwt->cwt_state == CWT_SLEEPING) { + cwt->cwt_state = CWT_RUNNING; + cv_signal(&cwt->cwt_cv); + } + mtx_unlock(&cwt->cwt_lock); +} + +void +cwt_queue_for_tx(struct icl_cxgbei_conn *icc) +{ + struct cxgbei_worker_thread *cwt = &cwt_tx_threads[icc->cwt]; + + mtx_lock(&cwt->cwt_lock); + TAILQ_INSERT_TAIL(&cwt->icc_head, icc, tx_link); + if (cwt->cwt_state == CWT_SLEEPING) { + cwt->cwt_state = CWT_RUNNING; + cv_signal(&cwt->cwt_cv); + } + mtx_unlock(&cwt->cwt_lock); +} + static int start_worker_threads(void) { + struct proc *cxgbei_proc; int i, rc; - struct cxgbei_worker_thread_softc *cwt; + struct cxgbei_worker_thread *cwt; worker_thread_count = min(mp_ncpus, 32); - cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE, + cwt_rx_threads = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE, + M_WAITOK | M_ZERO); + cwt_tx_threads = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE, M_WAITOK | M_ZERO); - MPASS(cxgbei_proc == NULL); - for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { + for (i = 0, cwt = &cwt_rx_threads[0]; i < worker_thread_count; + i++, cwt++) { + mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF); + cv_init(&cwt->cwt_cv, "cwt cv"); + TAILQ_INIT(&cwt->icc_head); + } + + for (i = 0, cwt = &cwt_tx_threads[0]; i < worker_thread_count; + i++, cwt++) { mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF); cv_init(&cwt->cwt_cv, "cwt cv"); - TAILQ_INIT(&cwt->rx_head); - rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0, - "cxgbei", "%d", i); + TAILQ_INIT(&cwt->icc_head); + } + + cxgbei_proc = NULL; + for (i = 0, cwt = &cwt_rx_threads[0]; i < worker_thread_count; + i++, cwt++) { + rc = kproc_kthread_add(cwt_rx_main, cwt, &cxgbei_proc, + &cwt->cwt_td, 0, 0, "cxgbei", "rx %d", i); if (rc != 0) { - printf("cxgbei: failed to start thread #%d/%d (%d)\n", + printf("cxgbei: failed to start rx thread #%d/%d (%d)\n", i + 1, worker_thread_count, rc); - mtx_destroy(&cwt->cwt_lock); - cv_destroy(&cwt->cwt_cv); - bzero(cwt, sizeof(*cwt)); - if (i == 0) { - free(cwt_softc, M_CXGBE); - worker_thread_count = 0; - - return (rc); - } - - /* Not fatal, carry on with fewer threads. */ - worker_thread_count = i; - rc = 0; - break; + return (rc); } + } - /* Wait for thread to start before moving on to the next one. 
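The new cwt_queue_for_rx()/cwt_queue_for_tx() helpers and the cwt_rx_main() loop above implement a simple per-worker hand-off: producers append a connection under the worker's mutex and wake the worker only when it is marked sleeping. A rough userspace analogue using pthreads and a BSD-style <sys/queue.h>; the kernel mtx/cv/kthread primitives are replaced purely for illustration and the names below are not driver identifiers.

#include <sys/queue.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct conn {
	int id;
	TAILQ_ENTRY(conn) link;
};

struct worker {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	bool sleeping;
	bool stop;
	TAILQ_HEAD(, conn) head;
};

static struct worker w = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cv = PTHREAD_COND_INITIALIZER,
	.head = TAILQ_HEAD_INITIALIZER(w.head),
};

/* Producer side: enqueue and wake the worker only if it is asleep. */
static void
queue_for_worker(struct worker *wp, struct conn *c)
{
	pthread_mutex_lock(&wp->lock);
	TAILQ_INSERT_TAIL(&wp->head, c, link);
	if (wp->sleeping) {
		wp->sleeping = false;
		pthread_cond_signal(&wp->cv);
	}
	pthread_mutex_unlock(&wp->lock);
}

/* Worker side: drain the list with the lock dropped around the real work. */
static void *
worker_main(void *arg)
{
	struct worker *wp = arg;
	struct conn *c;

	pthread_mutex_lock(&wp->lock);
	while (!wp->stop) {
		while ((c = TAILQ_FIRST(&wp->head)) != NULL) {
			TAILQ_REMOVE(&wp->head, c, link);
			pthread_mutex_unlock(&wp->lock);
			printf("servicing connection %d\n", c->id);
			pthread_mutex_lock(&wp->lock);
		}
		if (wp->stop)
			break;
		wp->sleeping = true;
		pthread_cond_wait(&wp->cv, &wp->lock);
	}
	pthread_mutex_unlock(&wp->lock);
	return (NULL);
}

int
main(void)
{
	struct conn c = { .id = 1 };
	pthread_t td;

	pthread_create(&td, NULL, worker_main, &w);
	queue_for_worker(&w, &c);

	/* Stop handshake: set the flag, wake the worker, wait for it. */
	pthread_mutex_lock(&w.lock);
	w.stop = true;
	pthread_cond_signal(&w.cv);
	pthread_mutex_unlock(&w.lock);
	pthread_join(td, NULL);
	return (0);
}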
*/ - mtx_lock(&cwt->cwt_lock); - while (cwt->cwt_state == 0) - cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); - mtx_unlock(&cwt->cwt_lock); + for (i = 0, cwt = &cwt_tx_threads[0]; i < worker_thread_count; + i++, cwt++) { + rc = kproc_kthread_add(cwt_tx_main, cwt, &cxgbei_proc, + &cwt->cwt_td, 0, 0, "cxgbei", "tx %d", i); + if (rc != 0) { + printf("cxgbei: failed to start tx thread #%d/%d (%d)\n", + i + 1, worker_thread_count, rc); + return (rc); + } } - MPASS(cwt_softc != NULL); - MPASS(worker_thread_count > 0); return (0); } static void -stop_worker_threads(void) +stop_worker_threads1(struct cxgbei_worker_thread *threads) { + struct cxgbei_worker_thread *cwt; int i; - struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0]; - MPASS(worker_thread_count >= 0); - - for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { + for (i = 0, cwt = &threads[0]; i < worker_thread_count; i++, cwt++) { mtx_lock(&cwt->cwt_lock); - MPASS(cwt->cwt_state == CWT_RUNNING || - cwt->cwt_state == CWT_SLEEPING); - cwt->cwt_state = CWT_STOP; - cv_signal(&cwt->cwt_cv); - do { - cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); - } while (cwt->cwt_state != CWT_STOPPED); + if (cwt->cwt_td != NULL) { + MPASS(cwt->cwt_state == CWT_RUNNING || + cwt->cwt_state == CWT_SLEEPING); + cwt->cwt_state = CWT_STOP; + cv_signal(&cwt->cwt_cv); + mtx_sleep(cwt->cwt_td, &cwt->cwt_lock, 0, "cwtstop", 0); + } mtx_unlock(&cwt->cwt_lock); mtx_destroy(&cwt->cwt_lock); cv_destroy(&cwt->cwt_cv); } - free(cwt_softc, M_CXGBE); + free(threads, M_CXGBE); +} + +static void +stop_worker_threads(void) +{ + + MPASS(worker_thread_count >= 0); + stop_worker_threads1(cwt_rx_threads); + stop_worker_threads1(cwt_tx_threads); } /* Select a worker thread for a connection. */ u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc) { struct adapter *sc = icc->sc; struct toepcb *toep = icc->toep; u_int i, n; n = worker_thread_count / sc->sge.nofldrxq; if (n > 0) i = toep->vi->pi->port_id * n + arc4random() % n; else i = arc4random() % worker_thread_count; CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i); return (i); } static int cxgbei_mod_load(void) { int rc; t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr); t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data); t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp); t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp); rc = start_worker_threads(); if (rc != 0) return (rc); rc = t4_register_uld(&cxgbei_uld_info); if (rc != 0) { stop_worker_threads(); return (rc); } t4_iterate(cxgbei_activate_all, NULL); return (rc); } static int cxgbei_mod_unload(void) { t4_iterate(cxgbei_deactivate_all, NULL); if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY) return (EBUSY); stop_worker_threads(); t4_register_cpl_handler(CPL_ISCSI_HDR, NULL); t4_register_cpl_handler(CPL_ISCSI_DATA, NULL); t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL); t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL); return (0); } #endif static int cxgbei_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = cxgbei_mod_load(); if (rc == 0) rc = icl_cxgbei_mod_load(); break; case MOD_UNLOAD: rc = icl_cxgbei_mod_unload(); if (rc == 0) rc = cxgbei_mod_unload(); break; default: rc = EINVAL; } #else printf("cxgbei: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t cxgbei_mod = { "cxgbei", cxgbei_modevent, NULL, }; MODULE_VERSION(cxgbei, 1); DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY); 
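cxgbei_select_worker_thread() above spreads connections over the worker array by giving each port a contiguous slice of workers when there are enough of them, and falling back to a purely random pick otherwise. A small userspace model of that selection; rand() stands in for arc4random() and the parameter values are made up.

#include <stdio.h>
#include <stdlib.h>

static unsigned int
select_worker(unsigned int nworkers, unsigned int nofldrxq, unsigned int port_id)
{
	unsigned int n = nworkers / nofldrxq;

	if (n > 0)
		return (port_id * n + rand() % n);	/* per-port slice */
	return (rand() % nworkers);			/* too few workers */
}

int
main(void)
{
	/* e.g. 32 workers, 8 offload rx queues, port 1 -> slice [4, 8) */
	printf("worker %u\n", select_worker(32, 8, 1));
	return (0);
}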
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1); MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1); MODULE_DEPEND(cxgbei, icl, 1, 1, 1); diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.h b/sys/dev/cxgbe/cxgbei/cxgbei.h index 58a5dac6d63b..b078f3110d62 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.h +++ b/sys/dev/cxgbe/cxgbei/cxgbei.h @@ -1,146 +1,149 @@ /*- * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __CXGBEI_OFLD_H__ #define __CXGBEI_OFLD_H__ #include enum { CWT_SLEEPING = 1, CWT_RUNNING = 2, CWT_STOP = 3, - CWT_STOPPED = 4, }; -struct cxgbei_worker_thread_softc { +struct cxgbei_worker_thread { struct mtx cwt_lock; struct cv cwt_cv; volatile int cwt_state; + struct thread *cwt_td; - TAILQ_HEAD(, icl_cxgbei_conn) rx_head; + TAILQ_HEAD(, icl_cxgbei_conn) icc_head; } __aligned(CACHE_LINE_SIZE); #define CXGBEI_CONN_SIGNATURE 0x56788765 -enum { - RXF_ACTIVE = 1 << 0, /* In the worker thread's queue */ -}; - struct cxgbei_cmp { LIST_ENTRY(cxgbei_cmp) link; uint32_t tt; /* Transfer tag. */ uint32_t next_buffer_offset; uint32_t last_datasn; }; LIST_HEAD(cxgbei_cmp_head, cxgbei_cmp); struct icl_cxgbei_conn { struct icl_conn ic; /* cxgbei specific stuff goes here. */ uint32_t icc_signature; int ulp_submode; struct adapter *sc; struct toepcb *toep; + u_int cwt; /* Receive related. */ - u_int rx_flags; /* protected by so_rcv lock */ - u_int cwt; + bool rx_active; /* protected by so_rcv lock */ STAILQ_HEAD(, icl_pdu) rcvd_pdus; /* protected by so_rcv lock */ TAILQ_ENTRY(icl_cxgbei_conn) rx_link; /* protected by cwt lock */ struct cxgbei_cmp_head *cmp_table; /* protected by cmp_lock */ struct mtx cmp_lock; unsigned long cmp_hash_mask; + + /* Transmit related. */ + bool tx_active; /* protected by ic lock */ + STAILQ_HEAD(, icl_pdu) sent_pdus; /* protected by ic lock */ + TAILQ_ENTRY(icl_cxgbei_conn) tx_link; /* protected by cwt lock */ }; static inline struct icl_cxgbei_conn * ic_to_icc(struct icl_conn *ic) { return (__containerof(ic, struct icl_cxgbei_conn, ic)); } /* PDU flags and signature. */ enum { ICPF_RX_HDR = 1 << 0, /* PDU header received. */ ICPF_RX_FLBUF = 1 << 1, /* PDU payload received in a freelist. */ ICPF_RX_DDP = 1 << 2, /* PDU payload DDP'd. */ ICPF_RX_STATUS = 1 << 3, /* Rx status received. 
*/ CXGBEI_PDU_SIGNATURE = 0x12344321 }; struct icl_cxgbei_pdu { struct icl_pdu ip; /* cxgbei specific stuff goes here. */ uint32_t icp_signature; uint32_t icp_seq; /* For debug only */ u_int icp_flags; u_int ref_cnt; icl_pdu_cb cb; int error; }; static inline struct icl_cxgbei_pdu * ip_to_icp(struct icl_pdu *ip) { return (__containerof(ip, struct icl_cxgbei_pdu, ip)); } struct cxgbei_data { u_int max_tx_data_len; u_int max_rx_data_len; u_int ddp_threshold; struct ppod_region pr; struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */ }; #define CXGBEI_MAX_ISO_PAYLOAD 65535 /* cxgbei.c */ u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *); +void cwt_queue_for_tx(struct icl_cxgbei_conn *); /* icl_cxgbei.c */ +void cwt_tx_main(void *); int icl_cxgbei_mod_load(void); int icl_cxgbei_mod_unload(void); struct icl_pdu *icl_cxgbei_new_pdu(int); void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *); struct cxgbei_cmp *cxgbei_find_cmp(struct icl_cxgbei_conn *, uint32_t); #endif diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c index 04dc3a68e3b8..10d5430c6413 100644 --- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c @@ -1,1457 +1,1571 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2015 Chelsio Communications, Inc. * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * cxgbei implementation of iSCSI Common Layer kobj(9) interface. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" #include "cxgbei.h" /* * Use the page pod tag for the TT hash. */ #define TT_HASH(icc, tt) (G_PPOD_TAG(tt) & (icc)->cmp_hash_mask) struct cxgbei_ddp_state { struct ppod_reservation prsv; struct cxgbei_cmp cmp; }; static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)"); SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Chelsio iSCSI offload"); static int first_burst_length = 8192; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN, &first_burst_length, 0, "First burst length"); static int max_burst_length = 2 * 1024 * 1024; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN, &max_burst_length, 0, "Maximum burst length"); static int sendspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN, &sendspace, 0, "Default send socket buffer size"); static int recvspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN, &recvspace, 0, "Default receive socket buffer size"); static volatile u_int icl_cxgbei_ncons; static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu; static icl_conn_pdu_data_segment_length_t icl_cxgbei_conn_pdu_data_segment_length; static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data; static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data; static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue; static icl_conn_pdu_queue_cb_t icl_cxgbei_conn_pdu_queue_cb; static icl_conn_handoff_t icl_cxgbei_conn_handoff; static icl_conn_free_t icl_cxgbei_conn_free; static icl_conn_close_t icl_cxgbei_conn_close; static icl_conn_task_setup_t icl_cxgbei_conn_task_setup; static icl_conn_task_done_t icl_cxgbei_conn_task_done; static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup; static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done; static kobj_method_t icl_cxgbei_methods[] = { KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu), KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free), KOBJMETHOD(icl_conn_pdu_data_segment_length, icl_cxgbei_conn_pdu_data_segment_length), KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data), KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data), KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue), KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb), KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff), KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free), KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close), KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup), KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done), KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup), KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done), { 0, 0 } }; DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn)); void icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) { struct icl_cxgbei_pdu *icp = 
ip_to_icp(ip); KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU")); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); m_freem(ip->ip_ahs_mbuf); m_freem(ip->ip_data_mbuf); m_freem(ip->ip_bhs_mbuf); KASSERT(ic != NULL || icp->ref_cnt == 1, ("orphaned PDU has oustanding references")); if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1) return; free(icp, M_CXGBEI); #ifdef DIAGNOSTIC if (__predict_true(ic != NULL)) refcount_release(&ic->ic_outstanding_pdus); #endif } static void icl_cxgbei_pdu_call_cb(struct icl_pdu *ip) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); if (icp->cb != NULL) icp->cb(ip, icp->error); #ifdef DIAGNOSTIC if (__predict_true(ip->ip_conn != NULL)) refcount_release(&ip->ip_conn->ic_outstanding_pdus); #endif free(icp, M_CXGBEI); } static void icl_cxgbei_pdu_done(struct icl_pdu *ip, int error) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (error != 0) icp->error = error; m_freem(ip->ip_ahs_mbuf); ip->ip_ahs_mbuf = NULL; m_freem(ip->ip_data_mbuf); ip->ip_data_mbuf = NULL; m_freem(ip->ip_bhs_mbuf); ip->ip_bhs_mbuf = NULL; /* * All other references to this PDU should have been dropped * by the m_freem() of ip_data_mbuf. */ if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) icl_cxgbei_pdu_call_cb(ip); else __assert_unreachable(); } static void icl_cxgbei_mbuf_done(struct mbuf *mb) { struct icl_cxgbei_pdu *icp = (struct icl_cxgbei_pdu *)mb->m_ext.ext_arg1; /* * NB: mb_free_mext() might leave ref_cnt as 1 without * decrementing it if it hits the fast path in the ref_cnt * check. */ icl_cxgbei_pdu_call_cb(&icp->ip); } struct icl_pdu * icl_cxgbei_new_pdu(int flags) { struct icl_cxgbei_pdu *icp; struct icl_pdu *ip; struct mbuf *m; icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO); if (__predict_false(icp == NULL)) return (NULL); icp->icp_signature = CXGBEI_PDU_SIGNATURE; icp->ref_cnt = 1; ip = &icp->ip; m = m_gethdr(flags, MT_DATA); if (__predict_false(m == NULL)) { free(icp, M_CXGBEI); return (NULL); } ip->ip_bhs_mbuf = m; ip->ip_bhs = mtod(m, struct iscsi_bhs *); memset(ip->ip_bhs, 0, sizeof(*ip->ip_bhs)); m->m_len = sizeof(struct iscsi_bhs); m->m_pkthdr.len = m->m_len; return (ip); } void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic) { ip->ip_conn = ic; #ifdef DIAGNOSTIC refcount_acquire(&ic->ic_outstanding_pdus); #endif } /* * Allocate icl_pdu with empty BHS to fill up by the caller. */ static struct icl_pdu * icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags) { struct icl_pdu *ip; ip = icl_cxgbei_new_pdu(flags); if (__predict_false(ip == NULL)) return (NULL); icl_cxgbei_new_pdu_set_conn(ip, ic); return (ip); } static size_t icl_pdu_data_segment_length(const struct icl_pdu *request) { uint32_t len = 0; len += request->ip_bhs->bhs_data_segment_len[0]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[1]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[2]; return (len); } size_t icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic, const struct icl_pdu *request) { return (icl_pdu_data_segment_length(request)); } static struct mbuf * finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp) { struct icl_pdu *ip = &icp->ip; uint8_t ulp_submode, padding; struct mbuf *m, *last; struct iscsi_bhs *bhs; int data_len; /* * Fix up the data segment mbuf first. */ m = ip->ip_data_mbuf; ulp_submode = icc->ulp_submode; if (m != NULL) { last = m_last(m); /* * Round up the data segment to a 4B boundary. Pad with 0 if * necessary. 
There will definitely be room in the mbuf. */ padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len; if (padding != 0) { MPASS(padding <= M_TRAILINGSPACE(last)); bzero(mtod(last, uint8_t *) + last->m_len, padding); last->m_len += padding; } } else { MPASS(ip->ip_data_len == 0); ulp_submode &= ~ULP_CRC_DATA; padding = 0; } /* * Now the header mbuf that has the BHS. */ m = ip->ip_bhs_mbuf; MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs)); MPASS(m->m_len == sizeof(struct iscsi_bhs)); bhs = ip->ip_bhs; data_len = ip->ip_data_len; if (data_len > icc->ic.ic_max_send_data_segment_length) { struct iscsi_bhs_data_in *bhsdi; int flags; KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p", __func__, padding, icp)); switch (bhs->bhs_opcode) { case ISCSI_BHS_OPCODE_SCSI_DATA_OUT: flags = 1; break; case ISCSI_BHS_OPCODE_SCSI_DATA_IN: flags = 2; break; default: panic("invalid opcode %#x for ISO", bhs->bhs_opcode); } data_len = icc->ic.ic_max_send_data_segment_length; bhsdi = (struct iscsi_bhs_data_in *)bhs; if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) { /* * Firmware will set F on the final PDU in the * burst. */ flags |= CXGBE_ISO_F; bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F; } set_mbuf_iscsi_iso(m, true); set_mbuf_iscsi_iso_flags(m, flags); set_mbuf_iscsi_iso_mss(m, data_len); } bhs->bhs_data_segment_len[2] = data_len; bhs->bhs_data_segment_len[1] = data_len >> 8; bhs->bhs_data_segment_len[0] = data_len >> 16; /* * Extract mbuf chain from PDU. */ m->m_pkthdr.len += ip->ip_data_len + padding; m->m_next = ip->ip_data_mbuf; set_mbuf_ulp_submode(m, ulp_submode); ip->ip_bhs_mbuf = NULL; ip->ip_data_mbuf = NULL; ip->ip_bhs = NULL; /* * Drop PDU reference on icp. Additional references might * still be held by zero-copy PDU buffers (ICL_NOCOPY). */ if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) icl_cxgbei_pdu_call_cb(ip); return (m); } +static void +cwt_push_pdus(struct icl_cxgbei_conn *icc, struct socket *so, struct mbufq *mq) +{ + struct epoch_tracker et; + struct icl_conn *ic = &icc->ic; + struct toepcb *toep = icc->toep; + struct inpcb *inp; + + /* + * Do not get inp from toep->inp as the toepcb might have + * detached already. 
+ */ + inp = sotoinpcb(so); + CURVNET_SET(toep->vnet); + NET_EPOCH_ENTER(et); + INP_WLOCK(inp); + + ICL_CONN_UNLOCK(ic); + if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || + __predict_false((toep->flags & TPF_ATTACHED) == 0)) { + mbufq_drain(mq); + } else { + mbufq_concat(&toep->ulp_pduq, mq); + t4_push_pdus(icc->sc, toep, 0); + } + INP_WUNLOCK(inp); + NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); + + ICL_CONN_LOCK(ic); +} + +void +cwt_tx_main(void *arg) +{ + struct cxgbei_worker_thread *cwt = arg; + struct icl_cxgbei_conn *icc; + struct icl_conn *ic; + struct icl_pdu *ip; + struct socket *so; + struct mbuf *m; + struct mbufq mq; + STAILQ_HEAD(, icl_pdu) tx_pdus = STAILQ_HEAD_INITIALIZER(tx_pdus); + + MPASS(cwt != NULL); + + mtx_lock(&cwt->cwt_lock); + MPASS(cwt->cwt_state == 0); + cwt->cwt_state = CWT_RUNNING; + cv_signal(&cwt->cwt_cv); + + mbufq_init(&mq, INT_MAX); + while (__predict_true(cwt->cwt_state != CWT_STOP)) { + cwt->cwt_state = CWT_RUNNING; + while ((icc = TAILQ_FIRST(&cwt->icc_head)) != NULL) { + TAILQ_REMOVE(&cwt->icc_head, icc, tx_link); + mtx_unlock(&cwt->cwt_lock); + + ic = &icc->ic; + + ICL_CONN_LOCK(ic); + MPASS(icc->tx_active); + STAILQ_SWAP(&icc->sent_pdus, &tx_pdus, icl_pdu); + ICL_CONN_UNLOCK(ic); + + while ((ip = STAILQ_FIRST(&tx_pdus)) != NULL) { + STAILQ_REMOVE_HEAD(&tx_pdus, ip_next); + + m = finalize_pdu(icc, ip_to_icp(ip)); + M_ASSERTPKTHDR(m); + MPASS((m->m_pkthdr.len & 3) == 0); + + mbufq_enqueue(&mq, m); + } + + ICL_CONN_LOCK(ic); + so = ic->ic_socket; + if (__predict_false(ic->ic_disconnecting) || + __predict_false(so == NULL)) { + mbufq_drain(&mq); + icc->tx_active = false; + ICL_CONN_UNLOCK(ic); + + mtx_lock(&cwt->cwt_lock); + continue; + } + + cwt_push_pdus(icc, so, &mq); + + MPASS(icc->tx_active); + if (STAILQ_EMPTY(&icc->sent_pdus)) { + icc->tx_active = false; + ICL_CONN_UNLOCK(ic); + + mtx_lock(&cwt->cwt_lock); + } else { + ICL_CONN_UNLOCK(ic); + + /* + * More PDUs were queued while we were + * busy sending the previous batch. + * Re-add this connection to the end + * of the queue. + */ + mtx_lock(&cwt->cwt_lock); + TAILQ_INSERT_TAIL(&cwt->icc_head, icc, + tx_link); + } + } + + /* Inner loop doesn't check for CWT_STOP, do that first. */ + if (__predict_false(cwt->cwt_state == CWT_STOP)) + break; + cwt->cwt_state = CWT_SLEEPING; + cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); + } + + MPASS(TAILQ_FIRST(&cwt->icc_head) == NULL); + mtx_unlock(&cwt->cwt_lock); + kthread_exit(); +} + int icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip, const void *addr, size_t len, int flags) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct mbuf *m, *m_tail; const char *src; MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len)); m_tail = ip->ip_data_mbuf; if (m_tail != NULL) for (; m_tail->m_next != NULL; m_tail = m_tail->m_next) ; if (flags & ICL_NOCOPY) { m = m_get(flags & ~ICL_NOCOPY, MT_DATA); if (m == NULL) { ICL_WARN("failed to allocate mbuf"); return (ENOMEM); } m->m_flags |= M_RDONLY; m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt, icl_cxgbei_mbuf_done, icp, NULL); m->m_len = len; if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m; ip->ip_data_len = len; } else { m_tail->m_next = m; m_tail = m_tail->m_next; ip->ip_data_len += len; } return (0); } src = (const char *)addr; /* Allocate as jumbo mbufs of size MJUM16BYTES. 
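cwt_tx_main() above drains a connection's backlog by swapping the whole sent_pdus list into a private list with one STAILQ_SWAP() while holding the connection lock, then finalizing and pushing the batch with the lock dropped. A minimal userspace illustration of that batching idiom, assuming a <sys/queue.h> that provides STAILQ_SWAP (FreeBSD's does); the names below are illustrative only.

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>

struct pdu {
	int seq;
	STAILQ_ENTRY(pdu) next;
};
STAILQ_HEAD(pdu_list, pdu);

static struct pdu_list pending = STAILQ_HEAD_INITIALIZER(pending);
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

static void
drain_pending(void)
{
	struct pdu_list batch = STAILQ_HEAD_INITIALIZER(batch);
	struct pdu *p;

	pthread_mutex_lock(&pending_lock);
	STAILQ_SWAP(&pending, &batch, pdu);	/* grab the whole backlog */
	pthread_mutex_unlock(&pending_lock);

	/* Producers can keep queueing while this batch is transmitted. */
	while ((p = STAILQ_FIRST(&batch)) != NULL) {
		STAILQ_REMOVE_HEAD(&batch, next);
		printf("sending PDU %d\n", p->seq);
	}
}

int
main(void)
{
	struct pdu a = { .seq = 1 }, b = { .seq = 2 };

	pthread_mutex_lock(&pending_lock);
	STAILQ_INSERT_TAIL(&pending, &a, next);
	STAILQ_INSERT_TAIL(&pending, &b, next);
	pthread_mutex_unlock(&pending_lock);

	drain_pending();
	return (0);
}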
*/ while (len >= MJUM16BYTES) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES); if (__predict_false(m == NULL)) { if ((flags & M_WAITOK) != 0) { /* Fall back to non-jumbo mbufs. */ break; } return (ENOMEM); } memcpy(mtod(m, void *), src, MJUM16BYTES); m->m_len = MJUM16BYTES; if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m_tail = m; ip->ip_data_len = MJUM16BYTES; } else { m_tail->m_next = m; m_tail = m_tail->m_next; ip->ip_data_len += MJUM16BYTES; } src += MJUM16BYTES; len -= MJUM16BYTES; } /* Allocate mbuf chain for the remaining data. */ if (len != 0) { m = m_getm2(NULL, len, flags, MT_DATA, 0); if (__predict_false(m == NULL)) return (ENOMEM); if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m; ip->ip_data_len = len; } else { m_tail->m_next = m; ip->ip_data_len += len; } for (; m != NULL; m = m->m_next) { m->m_len = min(len, M_SIZE(m)); memcpy(mtod(m, void *), src, m->m_len); src += m->m_len; len -= m->m_len; } MPASS(len == 0); } MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length, ic->ic_hw_isomax)); return (0); } void icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, size_t off, void *addr, size_t len) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (icp->icp_flags & ICPF_RX_DDP) return; /* data is DDP'ed, no need to copy */ m_copydata(ip->ip_data_mbuf, off, len, addr); } void icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) { icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL); } void icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip, icl_pdu_cb cb) { - struct epoch_tracker et; struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct socket *so = ic->ic_socket; - struct toepcb *toep = icc->toep; - struct inpcb *inp; - struct mbuf *m; MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); /* The kernel doesn't generate PDUs with AHS. */ MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0); ICL_CONN_LOCK_ASSERT(ic); icp->cb = cb; /* NOTE: sowriteable without so_snd lock is a mostly harmless race. */ if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) { icl_cxgbei_pdu_done(ip, ENOTCONN); return; } - m = finalize_pdu(icc, icp); - M_ASSERTPKTHDR(m); - MPASS((m->m_pkthdr.len & 3) == 0); - - /* - * Do not get inp from toep->inp as the toepcb might have detached - * already. 
- */ - inp = sotoinpcb(so); - CURVNET_SET(toep->vnet); - NET_EPOCH_ENTER(et); - INP_WLOCK(inp); - if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || - __predict_false((toep->flags & TPF_ATTACHED) == 0)) - m_freem(m); - else { - mbufq_enqueue(&toep->ulp_pduq, m); - t4_push_pdus(icc->sc, toep, 0); + STAILQ_INSERT_TAIL(&icc->sent_pdus, ip, ip_next); + if (!icc->tx_active) { + icc->tx_active = true; + cwt_queue_for_tx(icc); } - INP_WUNLOCK(inp); - NET_EPOCH_EXIT(et); - CURVNET_RESTORE(); } static struct icl_conn * icl_cxgbei_new_conn(const char *name, struct mtx *lock) { struct icl_cxgbei_conn *icc; struct icl_conn *ic; refcount_acquire(&icl_cxgbei_ncons); icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE, M_WAITOK | M_ZERO); icc->icc_signature = CXGBEI_CONN_SIGNATURE; STAILQ_INIT(&icc->rcvd_pdus); + STAILQ_INIT(&icc->sent_pdus); icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask); mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF); ic = &icc->ic; ic->ic_lock = lock; #ifdef DIAGNOSTIC refcount_init(&ic->ic_outstanding_pdus, 0); #endif ic->ic_name = name; ic->ic_offload = "cxgbei"; ic->ic_unmapped = false; CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); return (ic); } void icl_cxgbei_conn_free(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); mtx_destroy(&icc->cmp_lock); hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask); kobj_delete((struct kobj *)icc, M_CXGBE); refcount_release(&icl_cxgbei_ncons); } static int icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace, int rspace) { struct sockopt opt; int error, one = 1, ss, rs; ss = max(sendspace, sspace); rs = max(recvspace, rspace); error = soreserve(so, ss, rs); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_rcv); /* * Disable Nagle. */ bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } return (0); } /* * Request/response structure used to find out the adapter offloading a socket. */ struct find_ofld_adapter_rr { struct socket *so; struct adapter *sc; /* result */ }; static void find_offload_adapter(struct adapter *sc, void *arg) { struct find_ofld_adapter_rr *fa = arg; struct socket *so = fa->so; struct tom_data *td = sc->tom_softc; struct tcpcb *tp; struct inpcb *inp; /* Non-TCP were filtered out earlier. */ MPASS(so->so_proto->pr_protocol == IPPROTO_TCP); if (fa->sc != NULL) return; /* Found already. */ if (td == NULL) return; /* TOE not enabled on this adapter. */ inp = sotoinpcb(so); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { tp = intotcpcb(inp); if (tp->t_flags & TF_TOE && tp->tod == &td->tod) fa->sc = sc; /* Found. */ } INP_WUNLOCK(inp); } static bool is_memfree(struct adapter *sc) { uint32_t em; em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if ((em & F_EXT_MEM_ENABLE) != 0) return (false); if (is_t5(sc) && (em & F_EXT_MEM1_ENABLE) != 0) return (false); return (true); } /* XXXNP: move this to t4_tom. 
*/ static void send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen) { struct wrqe *wr; struct fw_flowc_wr *flowc; const u_int nparams = 1; u_int flowclen; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX; flowc->mnemval[0].val = htobe32(maxlen); txsd->tx_credits = howmany(flowclen, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); } static void set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode) { uint64_t val; CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x", __func__, toep->tid, ulp_submode); val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE, V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val, 0, 0); val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, val, val, 0, 0); } /* * XXXNP: Who is responsible for cleaning up the socket if this returns with an * error? Review all error paths. * * XXXNP: What happens to the socket's fd reference if the operation is * successful, and how does that affect the socket's life cycle? */ int icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct find_ofld_adapter_rr fa; struct file *fp; struct socket *so; struct inpcb *inp; struct tcpcb *tp; struct toepcb *toep; cap_rights_t rights; u_int max_rx_pdu_len, max_tx_pdu_len; int error, max_iso_pdus; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); /* * Steal the socket from userland. */ error = fget(curthread, fd, cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (EINVAL); } so = fp->f_data; if (so->so_type != SOCK_STREAM || so->so_proto->pr_protocol != IPPROTO_TCP) { fdrop(fp, curthread); return (EINVAL); } ICL_CONN_LOCK(ic); if (ic->ic_socket != NULL) { ICL_CONN_UNLOCK(ic); fdrop(fp, curthread); return (EBUSY); } ic->ic_disconnecting = false; ic->ic_socket = so; fp->f_ops = &badfileops; fp->f_data = NULL; fdrop(fp, curthread); ICL_CONN_UNLOCK(ic); /* Find the adapter offloading this socket. 

	/* Find the adapter offloading this socket. */
	fa.sc = NULL;
	fa.so = so;
	t4_iterate(find_offload_adapter, &fa);
	if (fa.sc == NULL)
		return (EINVAL);
	icc->sc = fa.sc;

	max_rx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_recv_data_segment_length;
	max_tx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_send_data_segment_length;
	if (ic->ic_header_crc32c) {
		max_rx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
		max_tx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
	}
	if (ic->ic_data_crc32c) {
		max_rx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
		max_tx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
	}

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		INP_WUNLOCK(inp);
		return (EBUSY);
	}

	/*
	 * socket could not have been "unoffloaded" if here.
	 */
	MPASS(tp->t_flags & TF_TOE);
	MPASS(tp->tod != NULL);
	MPASS(tp->t_toe != NULL);
	toep = tp->t_toe;
	MPASS(toep->vi->adapter == icc->sc);

	if (ulp_mode(toep) != ULP_MODE_NONE) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}

	icc->toep = toep;
	icc->cwt = cxgbei_select_worker_thread(icc);

	icc->ulp_submode = 0;
	if (ic->ic_header_crc32c)
		icc->ulp_submode |= ULP_CRC_HEADER;
	if (ic->ic_data_crc32c)
		icc->ulp_submode |= ULP_CRC_DATA;

	if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5 &&
	    !is_memfree(icc->sc)) {
		max_iso_pdus = CXGBEI_MAX_ISO_PAYLOAD / max_tx_pdu_len;
		ic->ic_hw_isomax = max_iso_pdus *
		    ic->ic_max_send_data_segment_length;
	} else
		max_iso_pdus = 1;

	toep->params.ulp_mode = ULP_MODE_ISCSI;
	toep->ulpcb = icc;

	send_iscsi_flowc_wr(icc->sc, toep,
	    roundup(max_iso_pdus * max_tx_pdu_len, tp->t_maxseg));
	set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode);
	INP_WUNLOCK(inp);

	return (icl_cxgbei_setsockopt(ic, so, max_tx_pdu_len, max_rx_pdu_len));
}

void
icl_cxgbei_conn_close(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_pdu *ip;
	struct socket *so;
	struct sockbuf *sb;
	struct inpcb *inp;
	struct toepcb *toep = icc->toep;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	so = ic->ic_socket;
	if (ic->ic_disconnecting || so == NULL) {
		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
		    __func__, icc, ic->ic_disconnecting, so);
		ICL_CONN_UNLOCK(ic);
		return;
	}
	ic->ic_disconnecting = true;

#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	    ic->ic_outstanding_pdus));
#endif
	ICL_CONN_UNLOCK(ic);

	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__,
	    toep ? toep->tid : -1, icc);
	inp = sotoinpcb(so);
	sb = &so->so_rcv;
	INP_WLOCK(inp);
	if (toep != NULL) {	/* NULL if connection was never offloaded. */
		toep->ulpcb = NULL;

+		/*
+		 * Wait for the cwt threads to stop processing this
+		 * connection for transmit.
+		 */
+		while (icc->tx_active)
+			rw_sleep(inp, &inp->inp_lock, 0, "conclo", 1);
+		/* Discard PDUs queued for TX. */
+		while (!STAILQ_EMPTY(&icc->sent_pdus)) {
+			ip = STAILQ_FIRST(&icc->sent_pdus);
+			STAILQ_REMOVE_HEAD(&icc->sent_pdus, ip_next);
+			icl_cxgbei_pdu_done(ip, ENOTCONN);
+		}
		mbufq_drain(&toep->ulp_pduq);

		/*
		 * Wait for the cwt threads to stop processing this
-		 * connection.
+		 * connection for receive.
		 */
		SOCKBUF_LOCK(sb);
-		if (icc->rx_flags & RXF_ACTIVE) {
-			volatile u_int *p = &icc->rx_flags;
+		if (icc->rx_active) {
+			volatile bool *p = &icc->rx_active;

			SOCKBUF_UNLOCK(sb);
			INP_WUNLOCK(inp);

-			while (*p & RXF_ACTIVE)
+			while (*p)
				pause("conclo", 1);

			INP_WLOCK(inp);
			SOCKBUF_LOCK(sb);
		}
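
		/*
		 * At this point rx_active is clear and the sockbuf lock is
		 * held again, so the rx worker is no longer processing this
		 * connection and it is safe to drain rcvd_pdus below.
		 */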

		/*
		 * Discard received PDUs not passed to the iSCSI
		 * layer.
		 */
		while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
			ip = STAILQ_FIRST(&icc->rcvd_pdus);
			STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
			icl_cxgbei_pdu_done(ip, ENOTCONN);
		}
		SOCKBUF_UNLOCK(sb);

		/*
		 * Grab a reference to use when waiting for the final
		 * CPL to be received.  If toep->inp is NULL, then
		 * final_cpl_received() has already been called (e.g.
		 * due to the peer sending a RST).
		 */
		if (toep->inp != NULL) {
			toep = hold_toepcb(toep);
			toep->flags |= TPF_WAITING_FOR_FINAL;
		} else
			toep = NULL;
	}
	INP_WUNLOCK(inp);

	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;
	ICL_CONN_UNLOCK(ic);

	/*
	 * XXXNP: we should send RST instead of FIN when PDUs held in various
	 * queues were purged instead of delivered reliably but soabort isn't
	 * really general purpose and wouldn't do the right thing here.
	 */
	soclose(so);

	/*
	 * Wait for the socket to fully close.  This ensures any
	 * pending received data has been received (and in particular,
	 * any data that would be received by DDP has been handled).
	 * Callers assume that it is safe to free buffers for tasks
	 * and transfers after this function returns.
	 */
	if (toep != NULL) {
		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);

		mtx_lock(lock);
		while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
			mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
		mtx_unlock(lock);
		free_toepcb(toep);
	}
}

static void
cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp,
    uint32_t tt)
{
#ifdef INVARIANTS
	struct cxgbei_cmp *cmp2;
#endif

	cmp->tt = tt;

	mtx_lock(&icc->cmp_lock);
#ifdef INVARIANTS
	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) {
		KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__));
	}
#endif
	LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link);
	mtx_unlock(&icc->cmp_lock);
}

struct cxgbei_cmp *
cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt)
{
	struct cxgbei_cmp *cmp;

	mtx_lock(&icc->cmp_lock);
	LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) {
		if (cmp->tt == tt)
			break;
	}
	mtx_unlock(&icc->cmp_lock);
	return (cmp);
}

static void
cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp)
{
#ifdef INVARIANTS
	struct cxgbei_cmp *cmp2;
#endif

	mtx_lock(&icc->cmp_lock);
#ifdef INVARIANTS
	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) {
		if (cmp2 == cmp)
			goto found;
	}
	panic("%s: could not find cmp", __func__);
found:
#endif
	LIST_REMOVE(cmp, link);
	mtx_unlock(&icc->cmp_lock);
}

int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
    struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct ppod_region *pr = &ci->pr;
	struct cxgbei_ddp_state *ddp;
	struct ppod_reservation *prsv;
	struct inpcb *inp;
	struct mbufq mq;
	uint32_t itt;
	int rc = 0;

	ICL_CONN_LOCK_ASSERT(ic);

	/* This is for the offload driver's state.  Must not be set already. */
	MPASS(arg != NULL);
	MPASS(*arg == NULL);

	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
	    csio->dxfer_len < ci->ddp_threshold || ic->ic_disconnecting ||
	    ic->ic_socket == NULL) {
no_ddp:
		/*
		 * No DDP for this I/O.  Allocate an ITT (based on the one
		 * passed in) that cannot be a valid hardware DDP tag in the
		 * iSCSI region.
		 */
		itt = *ittp & M_PPOD_TAG;
		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
		*ittp = htobe32(itt);
		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
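		/*
		 * Setting pr_invalid_bit is what guarantees the ITT built
		 * above can never collide with a tag handed out from the page
		 * pod region (presumably so the rx path can tell DDP and
		 * non-DDP tags apart).
		 */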
		if (rc != 0)
			counter_u64_add(
			    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
		return (0);
	}

	/*
	 * Reserve resources for DDP, update the itt that should be used in the
	 * PDU, and save DDP specific state for this I/O in *arg.
	 */
	ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
	if (ddp == NULL) {
		rc = ENOMEM;
		goto no_ddp;
	}
	prsv = &ddp->prsv;

	/* XXX add support for all CAM_DATA_ types */
	MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR);
	rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
	    csio->dxfer_len, prsv);
	if (rc != 0) {
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}

	mbufq_init(&mq, INT_MAX);
	rc = t4_write_page_pods_for_buf(sc, toep, prsv,
	    (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq);
	if (__predict_false(rc != 0)) {
		mbufq_drain(&mq);
		t4_free_page_pods(prsv);
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}

	/*
	 * Do not get inp from toep->inp as the toepcb might have
	 * detached already.
	 */
	inp = sotoinpcb(ic->ic_socket);
	INP_WLOCK(inp);
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
		INP_WUNLOCK(inp);
		mbufq_drain(&mq);
		t4_free_page_pods(prsv);
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}
	mbufq_concat(&toep->ulp_pduq, &mq);
	INP_WUNLOCK(inp);

	ddp->cmp.last_datasn = -1;
	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
	*ittp = htobe32(prsv->prsv_tag);
	*arg = prsv;
	counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
	return (0);
}

void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
{

	if (arg != NULL) {
		struct cxgbei_ddp_state *ddp = arg;

		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
		t4_free_page_pods(&ddp->prsv);
		free(ddp, M_CXGBEI);
	}
}

static inline bool
ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
{
#ifdef INVARIANTS
	int total_len = 0;
#endif

	MPASS(entries > 0);
	if (((vm_offset_t)sg[--entries].addr & 3U) != 0)
		return (false);

#ifdef INVARIANTS
	total_len += sg[entries].len;
#endif

	while (--entries >= 0) {
		if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 ||
		    (sg[entries].len % PAGE_SIZE) != 0)
			return (false);
#ifdef INVARIANTS
		total_len += sg[entries].len;
#endif
	}

	MPASS(total_len == xferlen);
	return (true);
}

/* XXXNP: PDU should be passed in as parameter, like on the initiator. */
#define	io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr)
#define	io_to_ddp_state(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)

int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
    uint32_t *tttp, void **arg)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct ctl_scsiio *ctsio = &io->scsiio;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct ppod_region *pr = &ci->pr;
	struct cxgbei_ddp_state *ddp;
	struct ppod_reservation *prsv;
	struct ctl_sg_entry *sgl, sg_entry;
	struct inpcb *inp;
	struct mbufq mq;
	int sg_entries = ctsio->kern_sg_entries;
	uint32_t ttt;
	int xferlen, rc = 0, alias;

	/* This is for the offload driver's state.  Must not be set already. */
	MPASS(arg != NULL);
	MPASS(*arg == NULL);

	if (ctsio->ext_data_filled == 0) {
		int first_burst;
		struct icl_pdu *ip = io_to_request_pdu(io);

#ifdef INVARIANTS
		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
		MPASS(ic == ip->ip_conn);
		MPASS(ip->ip_bhs_mbuf != NULL);
#endif
		first_burst = icl_pdu_data_segment_length(ip);

		/*
		 * Note that ICL calls conn_transfer_setup even if the first
		 * burst had everything and there's nothing left to transfer.
		 *
		 * NB: The CTL frontend might have provided a buffer
		 * whose length (kern_data_len) is smaller than the
		 * FirstBurstLength of unsolicited data.  Treat those
		 * as an empty transfer.
		 */
		xferlen = ctsio->kern_data_len;
		if (xferlen < first_burst ||
		    xferlen - first_burst < ci->ddp_threshold) {
no_ddp:
			/*
			 * No DDP for this transfer.  Allocate a TTT (based on
			 * the one passed in) that cannot be a valid hardware
			 * DDP tag in the iSCSI region.
			 */
			ttt = *tttp & M_PPOD_TAG;
			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
			*tttp = htobe32(ttt);
			MPASS(io_to_ddp_state(io) == NULL);
			if (rc != 0)
				counter_u64_add(
				    toep->ofld_rxq->rx_iscsi_ddp_setup_error,
				    1);
			return (0);
		}

		if (sg_entries == 0) {
			sgl = &sg_entry;
			sgl->len = xferlen;
			sgl->addr = (void *)ctsio->kern_data_ptr;
			sg_entries = 1;
		} else
			sgl = (void *)ctsio->kern_data_ptr;

		if (!ddp_sgl_check(sgl, sg_entries, xferlen))
			goto no_ddp;

		/*
		 * Reserve resources for DDP, update the ttt that should be used
		 * in the PDU, and save DDP specific state for this I/O.
		 */
		MPASS(io_to_ddp_state(io) == NULL);
		ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
		if (ddp == NULL) {
			rc = ENOMEM;
			goto no_ddp;
		}
		prsv = &ddp->prsv;

		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
		if (rc != 0) {
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		mbufq_init(&mq, INT_MAX);
		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl,
		    sg_entries, xferlen, &mq);
		if (__predict_false(rc != 0)) {
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		/*
		 * Do not get inp from toep->inp as the toepcb might
		 * have detached already.
		 */
		ICL_CONN_LOCK(ic);
		if (ic->ic_disconnecting || ic->ic_socket == NULL) {
			ICL_CONN_UNLOCK(ic);
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			return (ECONNRESET);
		}
		inp = sotoinpcb(ic->ic_socket);
		INP_WLOCK(inp);
		ICL_CONN_UNLOCK(ic);
		if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
			INP_WUNLOCK(inp);
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			return (ECONNRESET);
		}
		mbufq_concat(&toep->ulp_pduq, &mq);
		INP_WUNLOCK(inp);

		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
		    first_burst;
		ddp->cmp.last_datasn = -1;
		cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
		*tttp = htobe32(prsv->prsv_tag);
		io_to_ddp_state(io) = ddp;
		*arg = ctsio;
		counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
		return (0);
	}
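
	/*
	 * For later bursts of the same I/O the code below keeps the existing
	 * page pod reservation and only increments the alias field of the
	 * tag, presumably so each burst advertises a TTT distinct from the
	 * previous one while still mapping to the same reservation.
	 * Illustrative numbers only: with pr_alias_shift == 20 and
	 * pr_alias_mask == 0x00f00000, tag 0x00312345 would become
	 * 0x00412345.
	 */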

	/*
	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
	 * that a DDP buffer is being used for the I/O.
	 */
	ddp = io_to_ddp_state(ctsio);
	if (ddp == NULL)
		goto no_ddp;
	prsv = &ddp->prsv;

	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
	alias++;
	prsv->prsv_tag &= ~pr->pr_alias_mask;
	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;

	ddp->cmp.last_datasn = -1;
	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
	*tttp = htobe32(prsv->prsv_tag);
	*arg = ctsio;

	return (0);
}

void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
{
	struct ctl_scsiio *ctsio = arg;

	if (ctsio != NULL) {
		struct cxgbei_ddp_state *ddp;

		ddp = io_to_ddp_state(ctsio);
		MPASS(ddp != NULL);

		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
		if (ctsio->kern_data_len == ctsio->ext_data_filled ||
		    ic->ic_disconnecting) {
			t4_free_page_pods(&ddp->prsv);
			free(ddp, M_CXGBEI);
			io_to_ddp_state(ctsio) = NULL;
		}
	}
}

static void
cxgbei_limits(struct adapter *sc, void *arg)
{
	struct icl_drv_limits *idl = arg;
	struct cxgbei_data *ci;
	int max_dsl;

	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI)) {
		ci = sc->iscsi_ulp_softc;
		MPASS(ci != NULL);

		max_dsl = ci->max_rx_data_len;
		if (idl->idl_max_recv_data_segment_length > max_dsl)
			idl->idl_max_recv_data_segment_length = max_dsl;

		max_dsl = ci->max_tx_data_len;
		if (idl->idl_max_send_data_segment_length > max_dsl)
			idl->idl_max_send_data_segment_length = max_dsl;
	}

	end_synchronized_op(sc, LOCK_HELD);
}

static int
icl_cxgbei_limits(struct icl_drv_limits *idl)
{

	/* Maximum allowed by the RFC.  cxgbei_limits will clip them. */
	idl->idl_max_recv_data_segment_length = (1 << 24) - 1;
	idl->idl_max_send_data_segment_length = (1 << 24) - 1;

	/* These are somewhat arbitrary. */
	idl->idl_max_burst_length = max_burst_length;
	idl->idl_first_burst_length = first_burst_length;

	t4_iterate(cxgbei_limits, idl);

	return (0);
}

int
icl_cxgbei_mod_load(void)
{
	int rc;

	refcount_init(&icl_cxgbei_ncons, 0);

	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
	    icl_cxgbei_new_conn);

	return (rc);
}

int
icl_cxgbei_mod_unload(void)
{

	if (icl_cxgbei_ncons != 0)
		return (EBUSY);

	icl_unregister("cxgbei", false);

	return (0);
}
#endif
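
The sketch below is a minimal userspace model of the queue-and-flag transmit
handoff this change introduces (sent_pdus + tx_active + cwt_queue_for_tx): a
producer appends a PDU and wakes a worker only on the idle-to-active edge, the
worker drains the queue and clears the flag when it goes idle. All names here
(struct pdu, struct conn, conn_queue_pdu, tx_worker) and the pthread locking
are illustrative assumptions for the model, not driver code; the driver uses
kernel locks and hands the connection to a cwt worker thread instead of
signalling a condition variable.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/queue.h>

/* A fake PDU: just a payload length, linked into a tail queue. */
struct pdu {
	int len;
	STAILQ_ENTRY(pdu) link;
};

/* Per-connection transmit state, mirroring sent_pdus/tx_active in spirit. */
struct conn {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	STAILQ_HEAD(, pdu) sent_pdus;
	bool tx_active;		/* a worker currently owns this connection */
};

/* Producer: queue a PDU, wake the worker only on the idle->active edge. */
static void
conn_queue_pdu(struct conn *c, struct pdu *p)
{
	pthread_mutex_lock(&c->lock);
	STAILQ_INSERT_TAIL(&c->sent_pdus, p, link);
	if (!c->tx_active) {
		c->tx_active = true;
		pthread_cond_signal(&c->cv);	/* stands in for cwt_queue_for_tx() */
	}
	pthread_mutex_unlock(&c->lock);
}

/* Worker: drain everything queued so far, then mark the connection idle. */
static void *
tx_worker(void *arg)
{
	struct conn *c = arg;
	struct pdu *p;

	pthread_mutex_lock(&c->lock);
	for (;;) {
		while (!c->tx_active)
			pthread_cond_wait(&c->cv, &c->lock);
		while ((p = STAILQ_FIRST(&c->sent_pdus)) != NULL) {
			STAILQ_REMOVE_HEAD(&c->sent_pdus, link);
			pthread_mutex_unlock(&c->lock);
			printf("tx %d bytes\n", p->len);	/* "transmit" */
			free(p);
			pthread_mutex_lock(&c->lock);
		}
		c->tx_active = false;	/* lets a close path stop waiting */
	}
	return (NULL);
}

int
main(void)
{
	struct conn c;
	pthread_t t;

	pthread_mutex_init(&c.lock, NULL);
	pthread_cond_init(&c.cv, NULL);
	STAILQ_INIT(&c.sent_pdus);
	c.tx_active = false;

	pthread_create(&t, NULL, tx_worker, &c);
	for (int i = 1; i <= 3; i++) {
		struct pdu *p = malloc(sizeof(*p));

		if (p == NULL)
			abort();
		p->len = 512 * i;
		conn_queue_pdu(&c, p);
	}
	sleep(1);	/* let the worker drain before exiting */
	return (0);
}

The property the model tries to capture is the one the conn_close() hunk
relies on: the producer never re-queues an already-active connection, and once
the queue is empty the worker clears tx_active, which is what the close path's
"while (icc->tx_active) rw_sleep(...)" loop waits for.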