diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 9ecb4aeee939..a7b4162db3b5 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -1,2467 +1,2552 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" static void t4_aiotx_cancel(struct kaiocb *job); static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); void send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) { struct wrqe *wr; struct fw_flowc_wr *flowc; unsigned int nparams, flowclen, paramidx; struct vi_info *vi = toep->vi; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; unsigned int pfvf = sc->pf << S_FW_VIID_PFN; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), ("%s: flowc for tid %u sent already", __func__, toep->tid)); if (tp != NULL) nparams = 8; else nparams = 6; if (toep->params.tc_idx != -1) { MPASS(toep->params.tc_idx >= 0 && toep->params.tc_idx < sc->params.nsched_cls); nparams++; } flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(toep->tid)); #define FLOWC_PARAM(__m, __v) \ do { \ flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ flowc->mnemval[paramidx].val = htobe32(__v); \ paramidx++; \ } while (0) paramidx = 0; 
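/*
 * Emit the mnemonic/value pairs; paramidx advances with each
 * FLOWC_PARAM() and must end up equal to nparams, which the KASSERT
 * below checks.  For example, with tp != NULL and no scheduling class,
 * nparams is 8: PFNVFN, CH, PORT, IQID, SNDBUF, MSS, SNDNXT and RCVNXT.
 */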
FLOWC_PARAM(PFNVFN, pfvf); FLOWC_PARAM(CH, pi->tx_chan); FLOWC_PARAM(PORT, pi->tx_chan); FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); FLOWC_PARAM(SNDBUF, toep->params.sndbuf); if (tp) { FLOWC_PARAM(MSS, toep->params.emss); FLOWC_PARAM(SNDNXT, tp->snd_nxt); FLOWC_PARAM(RCVNXT, tp->rcv_nxt); } else FLOWC_PARAM(MSS, 512); CTR6(KTR_CXGBE, "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", __func__, toep->tid, toep->params.emss, toep->params.sndbuf, tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); if (toep->params.tc_idx != -1) FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); #undef FLOWC_PARAM KASSERT(paramidx == nparams, ("nparams mismatch")); KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS, ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16))); txsd->tx_credits = howmany(flowclen, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; toep->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } #ifdef RATELIMIT /* * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. */ static int update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) { int tc_idx, rc; const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; const int port_id = toep->vi->pi->port_id; CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); if (kbps == 0) { /* unbind */ tc_idx = -1; } else { rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); if (rc != 0) return (rc); MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls); } if (toep->params.tc_idx != tc_idx) { struct wrqe *wr; struct fw_flowc_wr *flowc; int nparams = 1, flowclen, flowclen16; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || (wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq)) == NULL) { if (tc_idx >= 0) t4_release_cl_rl(sc, port_id, tc_idx); return (ENOMEM); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; if (tc_idx == -1) flowc->mnemval[0].val = htobe32(0xff); else flowc->mnemval[0].val = htobe32(tc_idx); KASSERT(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS, ("%s: tx_credits %u too large", __func__, flowclen16)); txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); } if (toep->params.tc_idx >= 0) t4_release_cl_rl(sc, port_id, toep->params.tc_idx); toep->params.tc_idx = tc_idx; return (0); } #endif void send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) { struct wrqe *wr; struct cpl_abort_req *req; int tid = toep->tid; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ INP_WLOCK_ASSERT(inp); CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", __func__, toep->tid, inp->inp_flags & INP_DROPPED ? "inp dropped" : tcpstates[tp->t_state], toep->flags, inp->inp_flags, toep->flags & TPF_ABORT_SHUTDOWN ? 
" (abort already in progress)" : ""); if (toep->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ toep->flags |= TPF_ABORT_SHUTDOWN; KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %d.", __func__, tid)); wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); if (inp->inp_flags & INP_DROPPED) req->rsvd0 = htobe32(snd_nxt); else req->rsvd0 = htobe32(tp->snd_nxt); req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); req->cmd = CPL_ABORT_SEND_RST; /* * XXX: What's the correct way to tell that the inp hasn't been detached * from its socket? Should I even be flushing the snd buffer here? */ if ((inp->inp_flags & INP_DROPPED) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) /* because I'm not sure. See comment above */ sbflush(&so->so_snd); } t4_l2t_send(sc, wr, toep->l2te); } /* * Called when a connection is established to translate the TCP options * reported by HW to FreeBSD's native format. */ static void assign_rxopt(struct tcpcb *tp, uint16_t opt) { struct toepcb *toep = tp->t_toe; struct inpcb *inp = tptoinpcb(tp); struct adapter *sc = td_adapter(toep->td); INP_LOCK_ASSERT(inp); toep->params.mtu_idx = G_TCPOPT_MSS(opt); tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (G_TCPOPT_TSTAMP(opt)) { toep->params.tstamp = 1; toep->params.emss -= TCPOLEN_TSTAMP_APPA; tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ tp->ts_recent = 0; /* hmmm */ tp->ts_recent_age = tcp_ts_getticks(); } else toep->params.tstamp = 0; if (G_TCPOPT_SACK(opt)) { toep->params.sack = 1; tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ } else { toep->params.sack = 0; tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ } if (G_TCPOPT_WSCALE_OK(opt)) tp->t_flags |= TF_RCVD_SCALE; /* Doing window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); } else toep->params.wscale = 0; CTR6(KTR_CXGBE, "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", toep->tid, toep->params.mtu_idx, toep->params.emss, toep->params.tstamp, toep->params.sack, toep->params.wscale); } /* * Completes some final bits of initialization for just established connections * and changes their state to TCPS_ESTABLISHED. * * The ISNs are from the exchange of SYNs. 
*/ void make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) { struct inpcb *inp = toep->inp; struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); uint16_t tcpopt = be16toh(opt); INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED, ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", __func__, toep->tid, so, inp, tp, toep); tcp_state_change(tp, TCPS_ESTABLISHED); tp->t_starttime = ticks; TCPSTAT_INC(tcps_connects); tp->irs = irs; tcp_rcvseqinit(tp); tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; tp->snd_nxt = iss + 1; tp->snd_max = iss + 1; assign_rxopt(tp, tcpopt); send_flowc_wr(toep, tp); soisconnected(so); } int send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct wrqe *wr; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); wr = alloc_wrqe(sizeof(*req), toep->ctrlq); if (wr == NULL) return (0); req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); t4_wrq_tx(sc, wr); return (credits); } void t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct inpcb *inp = tptoinpcb(tp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; struct toepcb *toep = tp->t_toe; int rx_credits; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK_ASSERT(sb); rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; if (rx_credits > 0 && (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { rx_credits = send_rx_credits(sc, toep, rx_credits); tp->rcv_wnd += rx_credits; tp->rcv_adv += rx_credits; } } void t4_rcvd(struct toedev *tod, struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; SOCKBUF_LOCK(sb); t4_rcvd_locked(tod, tp); SOCKBUF_UNLOCK(sb); } /* * Close a connection by sending a CPL_CLOSE_CON_REQ message. */ int t4_close_conn(struct adapter *sc, struct toepcb *toep) { struct wrqe *wr; struct cpl_close_con_req *req; unsigned int tid = toep->tid; CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); if (toep->flags & TPF_FIN_SENT) return (0); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, tid)); wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | V_FW_WR_FLOWID(tid)); req->wr.wr_lo = cpu_to_be64(0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); req->rsvd = 0; toep->flags |= TPF_FIN_SENT; toep->flags &= ~TPF_SEND_FIN; t4_l2t_send(sc, wr, toep->l2te); return (0); } #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) #define MIN_ISO_TX_CREDITS (howmany(sizeof(struct cpl_tx_data_iso), 16)) #define MIN_TX_CREDITS(iso) \ (MIN_OFLD_TX_CREDITS + ((iso) ? 
MIN_ISO_TX_CREDITS : 0)) _Static_assert(MAX_OFLD_TX_CREDITS <= MAX_OFLD_TX_SDESC_CREDITS, "MAX_OFLD_TX_SDESC_CREDITS too small"); /* Maximum amount of immediate data we could stuff in a WR */ static inline int max_imm_payload(int tx_credits, int iso) { const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0; const int n = 1; /* Use no more than one desc for imm. data WR */ KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits < MIN_TX_CREDITS(iso)) return (0); if (tx_credits >= (n * EQ_ESIZE) / 16) return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) - iso_cpl_size); else return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) - iso_cpl_size); } /* Maximum number of SGL entries we could stuff in a WR */ static inline int max_dsgl_nsegs(int tx_credits, int iso) { int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso); KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits < MIN_TX_CREDITS(iso)) return (0); nseg += 2 * (sge_pair_credits * 16 / 24); if ((sge_pair_credits * 16) % 24 == 16) nseg++; return (nseg); } static inline void write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode, unsigned int immdlen, unsigned int plen, uint8_t credits, int shove, int ulp_submode) { struct fw_ofld_tx_data_wr *txwr = dst; txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) | V_FW_WR_IMMDLEN(immdlen)); txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | V_FW_WR_LEN16(credits)); txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); txwr->plen = htobe32(plen); if (toep->params.tx_align > 0) { if (plen < 2 * toep->params.emss) txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); else txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | (toep->params.nagle == 0 ? 0 : F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); } } /* * Generate a DSGL from a starting mbuf. The total number of segments and the * maximum segments in any one mbuf are provided. 
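 *
 * The ulptx_sgl header carries the first segment inline (len0/addr0);
 * the remaining segments are packed two per ulptx_sge_pair, so an even
 * total segment count leaves a half-filled pair whose unused length is
 * zeroed at the end.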
*/ static void write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) { struct mbuf *m; struct ulptx_sgl *usgl = dst; int i, j, rc; struct sglist sg; struct sglist_seg segs[n]; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); sglist_init(&sg, n, segs); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); i = -1; for (m = start; m != stop; m = m->m_next) { if (m->m_flags & M_EXTPG) rc = sglist_append_mbuf_epg(&sg, m, mtod(m, vm_offset_t), m->m_len); else rc = sglist_append(&sg, mtod(m, void *), m->m_len); if (__predict_false(rc != 0)) panic("%s: sglist_append %d", __func__, rc); for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } sglist_reset(&sg); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", __func__, nsegs, start, stop)); } +bool +t4_push_raw_wr(struct adapter *sc, struct toepcb *toep, struct mbuf *m) +{ +#ifdef INVARIANTS + struct inpcb *inp = toep->inp; +#endif + struct wrqe *wr; + struct ofld_tx_sdesc *txsd; + u_int credits, plen; + + INP_WLOCK_ASSERT(inp); + MPASS(mbuf_raw_wr(m)); + plen = m->m_pkthdr.len; + credits = howmany(plen, 16); + if (credits > toep->tx_credits) + return (false); + + wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); + if (wr == NULL) + return (false); + + m_copydata(m, 0, plen, wrtod(wr)); + m_freem(m); + + toep->tx_credits -= credits; + if (toep->tx_credits < MIN_OFLD_TX_CREDITS) + toep->flags |= TPF_TX_SUSPENDED; + + KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); + KASSERT(credits <= MAX_OFLD_TX_SDESC_CREDITS, + ("%s: tx_credits %u too large", __func__, credits)); + txsd = &toep->txsd[toep->txsd_pidx]; + txsd->plen = 0; + txsd->tx_credits = credits; + if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) + toep->txsd_pidx = 0; + toep->txsd_avail--; + + t4_wrq_tx(sc, wr); + return (true); +} + /* * Max number of SGL entries an offload tx work request can have. This is 41 * (1 + 40) for a full 512B work request. * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) */ #define OFLD_SGL_LEN (41) /* * Send data and/or a FIN to the peer. * * The socket's so_snd buffer consists of a stream of data starting with sb_mb * and linked together with m_next. sb_sndptr, if set, is the last mbuf that * was transmitted. * * drop indicates the number of bytes that should be dropped from the head of * the send buffer. It is an optimization that lets do_fw4_ack avoid creating * contention on the send buffer lock (before this change it used to do * sowwakeup and then t4_push_frames right after that when recovering from tx * stalls). When drop is set this function MUST drop the bytes and wake up any * writers. 
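 *
 * Each loop iteration below builds at most one work request: immediate
 * data copied straight into the WR when the payload fits, or a DSGL WR
 * otherwise.
 */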
*/ static void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) { struct mbuf *sndptr, *m, *sb_sndptr; struct fw_ofld_tx_data_wr *txwr; struct wrqe *wr; u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; + struct mbufq *pduq = &toep->ulp_pduq; int tx_credits, shove, compl, sowwakeup; struct ofld_tx_sdesc *txsd; bool nomap_mbuf_seen; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS || ulp_mode(toep) == ULP_MODE_RDMA, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags, drop); #endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; do { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); max_imm = max_imm_payload(tx_credits, 0); max_nsegs = max_dsgl_nsegs(tx_credits, 0); + if (__predict_false((sndptr = mbufq_first(pduq)) != NULL)) { + if (!t4_push_raw_wr(sc, toep, sndptr)) { + toep->flags |= TPF_TX_SUSPENDED; + return; + } + + m = mbufq_dequeue(pduq); + MPASS(m == sndptr); + + txsd = &toep->txsd[toep->txsd_pidx]; + continue; + } + SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); drop = 0; } sb_sndptr = sb->sb_sndptr; sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; plen = 0; nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ nomap_mbuf_seen = false; for (m = sndptr; m != NULL; m = m->m_next) { int n; if ((m->m_flags & M_NOTREADY) != 0) break; if (plen + m->m_len > MAX_OFLD_TX_SDESC_PLEN) break; if (m->m_flags & M_EXTPG) { #ifdef KERN_TLS if (m->m_epg_tls != NULL) { toep->flags |= TPF_KTLS; if (plen == 0) { SOCKBUF_UNLOCK(sb); t4_push_ktls(sc, toep, 0); return; } break; } #endif n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), m->m_len); } else n = sglist_count(mtod(m, void *), m->m_len); nsegs += n; plen += m->m_len; /* This mbuf sent us _over_ the nsegs limit, back out */ if (plen > max_imm && nsegs > max_nsegs) { nsegs -= n; plen -= m->m_len; if (plen == 0) { /* Too few credits */ toep->flags |= TPF_TX_SUSPENDED; if (sowwakeup) { if (!TAILQ_EMPTY( &toep->aiotx_jobq)) t4_aiotx_queue_toep(so, toep); sowwakeup_locked(so); } else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); return; } break; } if (m->m_flags & M_EXTPG) nomap_mbuf_seen = true; if (max_nsegs_1mbuf < n) max_nsegs_1mbuf = n; sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ /* This mbuf put us right at the max_nsegs limit */ if (plen > max_imm && nsegs == max_nsegs) { m = m->m_next; break; } } if (sbused(sb) > sb->sb_hiwat * 5 / 8 && toep->plen_nocompl + plen >= sb->sb_hiwat / 4) compl = 1; else compl = 0; if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(so, SO_SND, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) { if (!TAILQ_EMPTY(&toep->aiotx_jobq)) t4_aiotx_queue_toep(so, toep); sowwakeup_locked(so); } else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); /* nothing to send */ if (plen == 0) { KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0, ("%s: nothing to send, but m != NULL is ready", __func__)); break; } if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); if (plen <= max_imm && !nomap_mbuf_seen) { /* Immediate data tx */ wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } txwr = wrtod(wr); credits = howmany(wr->wr_len, 16); write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen, credits, shove, 0); m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); nsegs = 0; } else { int wr_len; /* DSGL tx */ wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? 
*/ toep->flags |= TPF_TX_SUSPENDED; return; } txwr = wrtod(wr); credits = howmany(wr_len, 16); write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen, credits, shove, 0); write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf); if (wr_len & 0xf) { uint64_t *pad = (uint64_t *) ((uintptr_t)txwr + wr_len); *pad = 0; } } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; toep->tx_nocompl += credits; toep->plen_nocompl += plen; if (toep->tx_credits <= toep->tx_total * 3 / 8 && toep->tx_nocompl >= toep->tx_total / 4) compl = 1; if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); toep->tx_nocompl = 0; toep->plen_nocompl = 0; } tp->snd_nxt += plen; tp->snd_max += plen; SOCKBUF_LOCK(sb); KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); sb->sb_sndptr = sb_sndptr; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TX_CREDITS) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, ("%s: plen %u too large", __func__, plen)); txsd->plen = plen; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; t4_l2t_send(sc, wr, toep->l2te); } while (m != NULL && (m->m_flags & M_NOTREADY) == 0); /* Send a FIN if requested, but only if there's no more data to send */ if (m == NULL && toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); } static inline void rqdrop_locked(struct mbufq *q, int plen) { struct mbuf *m; while (plen > 0) { m = mbufq_dequeue(q); /* Too many credits. */ MPASS(m != NULL); M_ASSERTPKTHDR(m); /* Partial credits. */ MPASS(plen >= m->m_pkthdr.len); plen -= m->m_pkthdr.len; m_freem(m); } } /* * Not a bit in the TCB, but is a bit in the ulp_submode field of the * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR. */ #define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO) static void write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss, int len, int npdu) { struct cpl_tx_data_iso *cpl; unsigned int burst_size; unsigned int last; /* * The firmware will set the 'F' bit on the last PDU when * either condition is true: * * - this large PDU is marked as the "last" slice * * - the amount of data payload bytes equals the burst_size * * The strategy used here is to always set the burst_size * artificially high (len includes the size of the template * BHS) and only set the "last" flag if the original PDU had * 'F' set. 
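 *
 * With burst_size pinned to the full length, the burst-size condition
 * can never fire on its own, so 'F' appears exactly where the original
 * PDU had it.
 */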
*/ burst_size = len; last = !!(flags & CXGBE_ISO_F); cpl = (struct cpl_tx_data_iso *)dst; cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) | V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) | V_CPL_TX_DATA_ISO_CPLHDRLEN(0) | V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) | V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) | V_CPL_TX_DATA_ISO_IMMEDIATE(0) | V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags))); cpl->ahs_len = 0; cpl->mpdu = htons(DIV_ROUND_UP(mss, 4)); cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4)); cpl->len = htonl(len); cpl->reserved2_seglen_offset = htonl(0); cpl->datasn_offset = htonl(0); cpl->buffer_offset = htonl(0); cpl->reserved3 = 0; } static struct wrqe * write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) { struct mbuf *m; struct fw_ofld_tx_data_wr *txwr; struct cpl_tx_data_iso *cpl_iso; void *p; struct wrqe *wr; u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; u_int adjusted_plen, imm_data, ulp_submode; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); int tx_credits, shove, npdu, wr_len; uint16_t iso_mss; static const u_int ulp_extra_len[] = {0, 4, 4, 8}; bool iso, nomap_mbuf_seen; M_ASSERTPKTHDR(sndptr); tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); if (mbuf_raw_wr(sndptr)) { plen = sndptr->m_pkthdr.len; KASSERT(plen <= SGE_MAX_WR_LEN, ("raw WR len %u is greater than max WR len", plen)); if (plen > tx_credits * 16) return (NULL); wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); if (__predict_false(wr == NULL)) return (NULL); m_copydata(sndptr, 0, plen, wrtod(wr)); return (wr); } iso = mbuf_iscsi_iso(sndptr); max_imm = max_imm_payload(tx_credits, iso); max_nsegs = max_dsgl_nsegs(tx_credits, iso); iso_mss = mbuf_iscsi_iso_mss(sndptr); plen = 0; nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ nomap_mbuf_seen = false; for (m = sndptr; m != NULL; m = m->m_next) { int n; if (m->m_flags & M_EXTPG) n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), m->m_len); else n = sglist_count(mtod(m, void *), m->m_len); nsegs += n; plen += m->m_len; /* * This mbuf would send us _over_ the nsegs limit. * Suspend tx because the PDU can't be sent out. */ if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs) return (NULL); if (m->m_flags & M_EXTPG) nomap_mbuf_seen = true; if (max_nsegs_1mbuf < n) max_nsegs_1mbuf = n; } if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); /* * We have a PDU to send. All of it goes out in one WR so 'm' * is NULL. A PDU's length is always a multiple of 4. */ MPASS(m == NULL); MPASS((plen & 3) == 0); MPASS(sndptr->m_pkthdr.len == plen); shove = !(tp->t_flags & TF_MORETOCOME); /* * plen doesn't include header and data digests, which are * generated and inserted in the right places by the TOE, but * they do occupy TCP sequence space and need to be accounted * for. */ ulp_submode = mbuf_ulp_submode(sndptr); MPASS(ulp_submode < nitems(ulp_extra_len)); npdu = iso ? 
howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1; adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu; if (iso) adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1); wr_len = sizeof(*txwr); if (iso) wr_len += sizeof(struct cpl_tx_data_iso); if (plen <= max_imm && !nomap_mbuf_seen) { /* Immediate data tx */ imm_data = plen; wr_len += plen; nsegs = 0; } else { /* DSGL tx */ imm_data = 0; wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; } wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? */ return (NULL); } txwr = wrtod(wr); credits = howmany(wr->wr_len, 16); if (iso) { write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR, imm_data + sizeof(struct cpl_tx_data_iso), adjusted_plen, credits, shove, ulp_submode | ULP_ISO); cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1); MPASS(plen == sndptr->m_pkthdr.len); write_tx_data_iso(cpl_iso, ulp_submode, mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu); p = cpl_iso + 1; } else { write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data, adjusted_plen, credits, shove, ulp_submode); p = txwr + 1; } if (imm_data != 0) { m_copydata(sndptr, 0, plen, p); } else { write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf); if (wr_len & 0xf) { uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); *pad = 0; } } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits: credits %u " "toep->tx_credits %u tx_credits %u nsegs %u " "max_nsegs %u iso %d", __func__, credits, toep->tx_credits, tx_credits, nsegs, max_nsegs, iso)); tp->snd_nxt += adjusted_plen; tp->snd_max += adjusted_plen; counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu); counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); if (iso) counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1); return (wr); } void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) { struct mbuf *sndptr, *m; struct fw_wr_hdr *wrhdr; struct wrqe *wr; u_int plen, credits; struct inpcb *inp = toep->inp; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; struct mbufq *pduq = &toep->ulp_pduq; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } if (drop) { struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int sbu; /* * An unlocked read is ok here as the data should only * transition from a non-zero value to either another * non-zero value or zero. Once it is zero it should * stay zero. */ if (__predict_false(sbused(sb) > 0)) { SOCKBUF_LOCK(sb); sbu = sbused(sb); if (sbu > 0) { /* * The data transmitted before the * tid's ULP mode changed to ISCSI is * still in so_snd. Incoming credits * should account for so_snd first.
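 * Whatever remains of drop afterwards is reclaimed from
 * ulp_pdu_reclaimq below.
 */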
*/ sbdrop_locked(sb, min(sbu, drop)); drop -= min(sbu, drop); } sowwakeup_locked(so); /* unlocks so_snd */ } rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); } while ((sndptr = mbufq_first(pduq)) != NULL) { wr = write_iscsi_mbuf_wr(toep, sndptr); if (wr == NULL) { toep->flags |= TPF_TX_SUSPENDED; return; } plen = sndptr->m_pkthdr.len; credits = howmany(wr->wr_len, 16); KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); m = mbufq_dequeue(pduq); MPASS(m == sndptr); mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); toep->tx_credits -= credits; toep->tx_nocompl += credits; toep->plen_nocompl += plen; /* * Ensure there are enough credits for a full-sized WR * as page pod WRs can be full-sized. */ if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 && toep->tx_nocompl >= toep->tx_total / 4) { wrhdr = wrtod(wr); wrhdr->hi |= htobe32(F_FW_WR_COMPL); toep->tx_nocompl = 0; toep->plen_nocompl = 0; } toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TX_CREDITS) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, ("%s: plen %u too large", __func__, plen)); txsd->plen = plen; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; t4_l2t_send(sc, wr, toep->l2te); } /* Send a FIN if requested, but only if there are no more PDUs to send */ if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); } static inline void t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) { if (ulp_mode(toep) == ULP_MODE_ISCSI) t4_push_pdus(sc, toep, drop); else if (toep->flags & TPF_KTLS) t4_push_ktls(sc, toep, drop); else t4_push_frames(sc, toep, drop); } +void +t4_raw_wr_tx(struct adapter *sc, struct toepcb *toep, struct mbuf *m) +{ +#ifdef INVARIANTS + struct inpcb *inp = toep->inp; +#endif + + INP_WLOCK_ASSERT(inp); + + /* + * If there are other raw WRs enqueued, enqueue to preserve + * FIFO ordering. + */ + if (!mbufq_empty(&toep->ulp_pduq)) { + mbufq_enqueue(&toep->ulp_pduq, m); + return; + } + + /* + * Cannot call t4_push_data here as that will lock so_snd and + * some callers of this run in rx handlers with so_rcv locked. + * Instead, just try to transmit this WR. 
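+ * If it cannot be sent right now, queue it and mark tx suspended so
+ * the WR is retried via t4_push_data() once credits return.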
+ */ + if (!t4_push_raw_wr(sc, toep, m)) { + mbufq_enqueue(&toep->ulp_pduq, m); + toep->flags |= TPF_TX_SUSPENDED; + } +} + int t4_tod_output(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; #ifdef INVARIANTS struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp %p dropped.", __func__, inp)); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); t4_push_data(sc, toep, 0); return (0); } int t4_send_fin(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; #ifdef INVARIANTS struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp %p dropped.", __func__, inp)); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); toep->flags |= TPF_SEND_FIN; if (tp->t_state >= TCPS_ESTABLISHED) t4_push_data(sc, toep, 0); return (0); } int t4_send_rst(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; #if defined(INVARIANTS) struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp %p dropped.", __func__, inp)); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); /* hmmmm */ KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc for tid %u [%s] not sent already", __func__, toep->tid, tcpstates[tp->t_state])); send_reset(sc, toep, 0); return (0); } /* * Peer has sent us a FIN. */ static int do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_peer_close *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp = NULL; struct socket *so; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PEER_CLOSE, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); if (__predict_false(toep->flags & TPF_SYNQE)) { /* * do_pass_establish must have run before do_peer_close and if * this is still a synqe instead of a toepcb then the connection * must be getting aborted. */ MPASS(toep->flags & TPF_ABORT_SHUTDOWN); CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, toep, toep->flags); return (0); } KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = intotcpcb(inp); CTR6(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, toep->ddp.flags, inp); if (toep->flags & TPF_ABORT_SHUTDOWN) goto done; if (ulp_mode(toep) == ULP_MODE_TCPDDP) { DDP_LOCK(toep); if (__predict_false(toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) handle_ddp_close(toep, tp, cpl->rcv_nxt); DDP_UNLOCK(toep); } so = inp->inp_socket; socantrcvmore(so); if (ulp_mode(toep) == ULP_MODE_RDMA || (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { /* * There might be data received via DDP before the FIN * not reported to the driver. Just assume the * sequence number in the CPL is correct as the * sequence number of the FIN. 
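 * The rcv_nxt consistency KASSERT is therefore skipped for those ULP
 * modes.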
*/ } else { KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt), ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, be32toh(cpl->rcv_nxt))); } tp->rcv_nxt = be32toh(cpl->rcv_nxt); switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; case TCPS_FIN_WAIT_2: restore_so_proto(so, inp->inp_vflag & INP_IPV6); t4_pcb_detach(NULL, tp); tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ NET_EPOCH_EXIT(et); CURVNET_RESTORE(); INP_WLOCK(inp); final_cpl_received(toep); return (0); default: log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", __func__, tid, tp->t_state); } done: INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * Peer has ACK'd our FIN. */ static int do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp = NULL; struct socket *so = NULL; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_CON_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = intotcpcb(inp); CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); if (toep->flags & TPF_ABORT_SHUTDOWN) goto done; so = inp->inp_socket; tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ switch (tp->t_state) { case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ restore_so_proto(so, inp->inp_vflag & INP_IPV6); t4_pcb_detach(NULL, tp); tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ NET_EPOCH_EXIT(et); CURVNET_RESTORE(); INP_WLOCK(inp); final_cpl_received(toep); /* no more CPLs expected */ return (0); case TCPS_LAST_ACK: if (tcp_close(tp)) INP_WUNLOCK(inp); goto release; case TCPS_FIN_WAIT_1: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) soisdisconnected(so); tcp_state_change(tp, TCPS_FIN_WAIT_2); break; default: log(LOG_ERR, "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", __func__, tid, tcpstates[tp->t_state]); } done: INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, int rst_status) { struct wrqe *wr; struct cpl_abort_rpl *cpl; wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } cpl = wrtod(wr); INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); cpl->cmd = rst_status; t4_wrq_tx(sc, wr); } static int abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) { switch (abort_reason) { case CPL_ERR_BAD_SYN: case CPL_ERR_CONN_RESET: return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: return (ETIMEDOUT); default: return (EIO); } } /* * TCP RST from the peer, timeout, or some other such critical error. 
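 * Whatever the cause, the T4 is owed a CPL_ABORT_RPL, which is sent on
 * the way out of the handler.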
*/ static int do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct sge_ofld_txq *ofld_txq = toep->ofld_txq; struct inpcb *inp; struct tcpcb *tp; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); if (toep->flags & TPF_SYNQE) return (do_abort_req_synqe(iq, rss, m)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); if (negative_advice(cpl->status)) { CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", __func__, cpl->status, tid, toep->flags); return (0); /* Ignore negative advice */ } inp = toep->inp; CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp->inp_flags, cpl->status); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. */ if (toep->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } toep->flags |= TPF_ABORT_SHUTDOWN; if ((inp->inp_flags & INP_DROPPED) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) so_error_set(so, abort_status_to_errno(tp, cpl->status)); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ } final_cpl_received(toep); done: NET_EPOCH_EXIT(et); CURVNET_RESTORE(); send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } /* * Reply to the CPL_ABORT_REQ (send_reset) */ static int do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); if (toep->flags & TPF_SYNQE) return (do_abort_rpl_synqe(iq, rss, m)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", __func__, tid, toep, inp, cpl->status); KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply", __func__)); INP_WLOCK(inp); final_cpl_received(toep); return (0); } static int do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct epoch_tracker et; int len; uint32_t ddp_placed = 0; if (__predict_false(toep->flags & TPF_SYNQE)) { /* * do_pass_establish must have run before do_rx_data and if this * is still a synqe instead of a toepcb then the connection must * be getting aborted. 
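 * Any accompanying payload is simply freed below.
 */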
*/ MPASS(toep->flags & TPF_ABORT_SHUTDOWN); CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, toep, toep->flags); m_freem(m); return (0); } KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } tp = intotcpcb(inp); if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && toep->flags & TPF_TLS_RECEIVE)) { /* Received "raw" data on a TLS socket. */ CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", __func__, tid, len); do_rx_data_tls(cpl, toep, m); return (0); } if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; tp->rcv_nxt += len; if (tp->rcv_wnd < len) { KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, ("%s: negative window size", __func__)); } tp->rcv_wnd -= len; tp->t_rcvtime = ticks; if (ulp_mode(toep) == ULP_MODE_TCPDDP) DDP_LOCK(toep); so = inp_inpcbtosocket(inp); sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, len); m_freem(m); SOCKBUF_UNLOCK(sb); if (ulp_mode(toep) == ULP_MODE_TCPDDP) DDP_UNLOCK(toep); INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && len > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } if (ulp_mode(toep) == ULP_MODE_TCPDDP) { int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", __func__, tid, len); if (changed) { if (toep->ddp.flags & DDP_SC_REQ) toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; else if (cpl->ddp_off == 1) { /* Fell out of DDP mode */ toep->ddp.flags &= ~DDP_ON; CTR1(KTR_CXGBE, "%s: fell out of DDP mode", __func__); insert_ddp_data(toep, ddp_placed); } else { /* * Data was received while still * ULP_MODE_NONE, just fall through. */ } } if (toep->ddp.flags & DDP_ON) { /* * CPL_RX_DATA with DDP on can only be an indicate. * Start posting queued AIO requests via DDP. The * payload that arrived in this indicate is appended * to the socket buffer as usual. 
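 * Subsequent payload can then be placed directly into the AIO buffers
 * by DDP.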
*/ handle_ddp_indicate(toep); } } sbappendstream_locked(sb, m, 0); t4_rcvd_locked(&toep->td->tod, tp); if (ulp_mode(toep) == ULP_MODE_TCPDDP && (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && sbavail(sb) != 0) { CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, tid); ddp_queue_toep(toep); } if (toep->flags & TPF_TLS_STARTING) tls_received_starting_data(sc, toep, sb, len); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); if (ulp_mode(toep) == ULP_MODE_TCPDDP) DDP_UNLOCK(toep); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } static int do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp; struct tcpcb *tp; struct socket *so; uint8_t credits = cpl->credits; struct ofld_tx_sdesc *txsd; int plen; #ifdef INVARIANTS unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); #endif /* * Very unusual case: we'd sent a flowc + abort_req for a synq entry and * now this comes back carrying the credits for the flowc. */ if (__predict_false(toep->flags & TPF_SYNQE)) { KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, ("%s: credits for a synq entry %p", __func__, toep)); return (0); } inp = toep->inp; KASSERT(opcode == CPL_FW4_ACK, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); INP_WLOCK(inp); if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { INP_WUNLOCK(inp); return (0); } KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); tp = intotcpcb(inp); if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { tcp_seq snd_una = be32toh(cpl->snd_una); #ifdef INVARIANTS if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { log(LOG_ERR, "%s: unexpected seq# %x for TID %u, snd_una %x\n", __func__, snd_una, toep->tid, tp->snd_una); } #endif if (tp->snd_una != snd_una) { tp->snd_una = snd_una; tp->ts_recent_age = tcp_ts_getticks(); } } #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); #endif so = inp->inp_socket; txsd = &toep->txsd[toep->txsd_cidx]; plen = 0; while (credits) { KASSERT(credits >= txsd->tx_credits, ("%s: too many (or partial) credits", __func__)); credits -= txsd->tx_credits; toep->tx_credits += txsd->tx_credits; plen += txsd->plen; txsd++; toep->txsd_avail++; KASSERT(toep->txsd_avail <= toep->txsd_total, ("%s: txsd avail > total", __func__)); if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { txsd = &toep->txsd[0]; toep->txsd_cidx = 0; } } if (toep->tx_credits == toep->tx_total) { toep->tx_nocompl = 0; toep->plen_nocompl = 0; } if (toep->flags & TPF_TX_SUSPENDED && toep->tx_credits >= toep->tx_total / 4) { #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, tid); #endif toep->flags &= ~TPF_TX_SUSPENDED; CURVNET_SET(toep->vnet); t4_push_data(sc, toep, plen); CURVNET_RESTORE(); } else if (plen > 0) { struct sockbuf *sb = &so->so_snd; int sbu; SOCKBUF_LOCK(sb); sbu = sbused(sb); if (ulp_mode(toep) == ULP_MODE_ISCSI) { if (__predict_false(sbu > 0)) { /* * The data transmitted before the * tid's ULP mode changed to ISCSI is * still in so_snd. Incoming credits * should account for so_snd first. 
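 * Only the credits left over after draining so_snd reclaim PDUs from
 * ulp_pdu_reclaimq.
 */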
*/ sbdrop_locked(sb, min(sbu, plen)); plen -= min(sbu, plen); } sowwakeup_locked(so); /* unlocks so_snd */ rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); } else { #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, tid, plen); #endif sbdrop_locked(sb, plen); if (!TAILQ_EMPTY(&toep->aiotx_jobq)) t4_aiotx_queue_toep(so, toep); sowwakeup_locked(so); /* unlocks so_snd */ } SOCKBUF_UNLOCK_ASSERT(sb); } INP_WUNLOCK(inp); return (0); } void t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) { struct wrqe *wr; struct cpl_set_tcb_field *req; struct ofld_tx_sdesc *txsd; MPASS((cookie & ~M_COOKIE) == 0); if (reply) { MPASS(cookie != CPL_COOKIE_RESERVED); } wr = alloc_wrqe(sizeof(*req), wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); if (reply == 0) req->reply_ctrl |= htobe16(F_NO_REPLY); req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); req->mask = htobe64(mask); req->val = htobe64(val); if (wrq->eq.type == EQ_OFLD) { txsd = &toep->txsd[toep->txsd_pidx]; _Static_assert(howmany(sizeof(*req), 16) <= MAX_OFLD_TX_SDESC_CREDITS, "MAX_OFLD_TX_SDESC_CREDITS too small"); txsd->tx_credits = howmany(sizeof(*req), 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; } t4_wrq_tx(sc, wr); } void t4_init_cpl_io_handlers(void) { t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); } void t4_uninit_cpl_io_handlers(void) { t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_RX_DATA, NULL); t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); } /* * Use the 'backend1' field in AIO jobs to hold an error that should * be reported when the job is completed, the 'backend3' field to * store the amount of data sent by the AIO job so far, and the * 'backend4' field to hold a reference count on the job. * * Each unmapped mbuf holds a reference on the job as does the queue * so long as the job is queued. 
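 * The final reference release in aiotx_free_job() is what completes
 * (or cancels) the job.
 */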
*/ #define aio_error backend1 #define aio_sent backend3 #define aio_refs backend4 #ifdef VERBOSE_TRACES static int jobtotid(struct kaiocb *job) { struct socket *so; struct tcpcb *tp; struct toepcb *toep; so = job->fd_file->f_data; tp = sototcpcb(so); toep = tp->t_toe; return (toep->tid); } #endif static void aiotx_free_job(struct kaiocb *job) { long status; int error; if (refcount_release(&job->aio_refs) == 0) return; error = (intptr_t)job->aio_error; status = job->aio_sent; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, jobtotid(job), job, status, error); #endif if (error != 0 && status != 0) error = 0; if (error == ECANCELED) aio_cancel(job); else if (error) aio_complete(job, -1, error); else { job->msgsnd = 1; aio_complete(job, status, 0); } } static void aiotx_free_pgs(struct mbuf *m) { struct kaiocb *job; vm_page_t pg; M_ASSERTEXTPG(m); job = m->m_ext.ext_arg1; #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, m->m_len, jobtotid(job)); #endif for (int i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); vm_page_unwire(pg, PQ_ACTIVE); } aiotx_free_job(job); } /* * Allocate a chain of unmapped mbufs describing the next 'len' bytes * of an AIO job. */ static struct mbuf * alloc_aiotx_mbuf(struct kaiocb *job, int len) { struct vmspace *vm; vm_page_t pgs[MBUF_PEXT_MAX_PGS]; struct mbuf *m, *top, *last; vm_map_t map; vm_offset_t start; int i, mlen, npages, pgoff; KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, ("%s(%p, %d): request to send beyond end of buffer", __func__, job, len)); /* * The AIO subsystem will cancel and drain all requests before * permitting a process to exit or exec, so p_vmspace should * be stable here. */ vm = job->userproc->p_vmspace; map = &vm->vm_map; start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent; pgoff = start & PAGE_MASK; top = NULL; last = NULL; while (len > 0) { mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff); KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0, ("%s: next start (%#jx + %#x) is not page aligned", __func__, (uintmax_t)start, mlen)); npages = vm_fault_quick_hold_pages(map, start, mlen, VM_PROT_WRITE, pgs, nitems(pgs)); if (npages < 0) break; m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY); m->m_epg_1st_off = pgoff; m->m_epg_npgs = npages; if (npages == 1) { KASSERT(mlen + pgoff <= PAGE_SIZE, ("%s: single page is too large (off %d len %d)", __func__, pgoff, mlen)); m->m_epg_last_len = mlen; } else { m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) - (npages - 2) * PAGE_SIZE; } for (i = 0; i < npages; i++) m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]); m->m_len = mlen; m->m_ext.ext_size = npages * PAGE_SIZE; m->m_ext.ext_arg1 = job; refcount_acquire(&job->aio_refs); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d", __func__, jobtotid(job), m, job, npages); #endif if (top == NULL) top = m; else last->m_next = m; last = m; len -= mlen; start += mlen; pgoff = 0; } return (top); } static void t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) { struct sockbuf *sb; struct inpcb *inp; struct tcpcb *tp; struct mbuf *m; u_int sent; int error, len; bool moretocome, sendmore; sb = &so->so_snd; SOCKBUF_UNLOCK(sb); m = NULL; #ifdef MAC error = mac_socket_check_send(job->fd_file->f_cred, so); if (error != 0) goto out; #endif /* Inline sosend_generic(). 
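 * The checks below (SBS_CANTSENDMORE, so_error, SS_ISCONNECTED) mirror
 * the generic socket send path.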
*/ error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); MPASS(error == 0); sendanother: SOCKBUF_LOCK(sb); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); if ((so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); error = ENOTCONN; goto out; } if (sbspace(sb) < sb->sb_lowat) { MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); /* * Don't block if there is too little room in the socket * buffer. Instead, requeue the request. */ if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); goto out; } /* * Write as much data as the socket permits, but no more than a * single sndbuf at a time. */ len = sbspace(sb); if (len > job->uaiocb.aio_nbytes - job->aio_sent) { len = job->uaiocb.aio_nbytes - job->aio_sent; moretocome = false; } else moretocome = true; if (len > toep->params.sndbuf) { len = toep->params.sndbuf; sendmore = true; } else sendmore = false; if (!TAILQ_EMPTY(&toep->aiotx_jobq)) moretocome = true; SOCKBUF_UNLOCK(sb); MPASS(len != 0); m = alloc_aiotx_mbuf(job, len); if (m == NULL) { SOCK_IO_SEND_UNLOCK(so); error = EFAULT; goto out; } /* Inlined tcp_usr_send(). */ inp = toep->inp; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); error = ECONNRESET; goto out; } sent = m_length(m, NULL); job->aio_sent += sent; counter_u64_add(toep->ofld_txq->tx_aio_octets, sent); sbappendstream(sb, m, 0); m = NULL; if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); if (moretocome) tp->t_flags |= TF_MORETOCOME; error = tcp_output(tp); if (error < 0) { INP_UNLOCK_ASSERT(inp); SOCK_IO_SEND_UNLOCK(so); error = -error; goto out; } if (moretocome) tp->t_flags &= ~TF_MORETOCOME; } INP_WUNLOCK(inp); if (sendmore) goto sendanother; SOCK_IO_SEND_UNLOCK(so); if (error) goto out; /* * If this is a blocking socket and the request has not been * fully completed, requeue it until the socket is ready * again. */ if (job->aio_sent < job->uaiocb.aio_nbytes && !(so->so_state & SS_NBIO)) { SOCKBUF_LOCK(sb); if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); return; } /* * If the request will not be requeued, drop the queue's * reference to the job. Any mbufs in flight should still * hold a reference, but this drops the reference that the * queue owns while it is waiting to queue mbufs to the * socket.
*/ aiotx_free_job(job); counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1); out: if (error) { job->aio_error = (void *)(intptr_t)error; aiotx_free_job(job); } m_freem(m); SOCKBUF_LOCK(sb); } static void t4_aiotx_task(void *context, int pending) { struct toepcb *toep = context; struct socket *so; struct kaiocb *job; struct epoch_tracker et; so = toep->aiotx_so; CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); SOCKBUF_LOCK(&so->so_snd); while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { job = TAILQ_FIRST(&toep->aiotx_jobq); TAILQ_REMOVE(&toep->aiotx_jobq, job, list); if (!aio_clear_cancel_function(job)) continue; t4_aiotx_process_job(toep, so, job); } toep->aiotx_so = NULL; SOCKBUF_UNLOCK(&so->so_snd); NET_EPOCH_EXIT(et); free_toepcb(toep); sorele(so); CURVNET_RESTORE(); } static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep) { SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false"); #endif if (toep->aiotx_so != NULL) return; soref(so); toep->aiotx_so = so; hold_toepcb(toep); soaio_enqueue(&toep->aiotx_task); } static void t4_aiotx_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; struct tcpcb *tp; struct toepcb *toep; so = job->fd_file->f_data; tp = sototcpcb(so); toep = tp->t_toe; MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); sb = &so->so_snd; SOCKBUF_LOCK(sb); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); job->aio_error = (void *)(intptr_t)ECANCELED; aiotx_free_job(job); } int t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); /* This only handles writes. */ if (job->uaiocb.aio_lio_opcode != LIO_WRITE) return (EOPNOTSUPP); if (!sc->tt.tx_zcopy) return (EOPNOTSUPP); if (tls_tx_key(toep)) return (EOPNOTSUPP); SOCKBUF_LOCK(&so->so_snd); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); #endif if (!aio_set_cancel_function(job, t4_aiotx_cancel)) panic("new job was cancelled"); refcount_init(&job->aio_refs, 1); TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); if (sowriteable(so)) t4_aiotx_queue_toep(so, toep); SOCKBUF_UNLOCK(&so->so_snd); return (0); } void aiotx_init_toep(struct toepcb *toep) { TAILQ_INIT(&toep->aiotx_jobq); TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c index 857832aafa5c..3dd3c28e3a86 100644 --- a/sys/dev/cxgbe/tom/t4_tls.c +++ b/sys/dev/cxgbe/tom/t4_tls.c @@ -1,1301 +1,1314 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2017-2018 Chelsio Communications, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_kern_tls.h" #include #ifdef KERN_TLS #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_tcb.h" #include "crypto/t4_crypto.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while * the mbuf is in the ulp_pdu_reclaimq. */ #define tls_tcp_seq PH_loc.thirtytwo[0] static void t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) { struct adapter *sc = td_adapter(toep->td); t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0); } /* TLS and DTLS common routines */ bool can_tls_offload(struct adapter *sc) { return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS); } int tls_tx_key(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; return (tls_ofld->tx_key_addr >= 0); } /* Set TF_RX_QUIESCE to pause receive. */ static void t4_set_rx_quiesce(struct toepcb *toep) { struct adapter *sc = td_adapter(toep->td); t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), V_TF_RX_QUIESCE(1), 1, CPL_COOKIE_TOM); } /* Clear TF_RX_QUIESCE to re-enable receive. */ static void t4_clear_rx_quiesce(struct toepcb *toep) { t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0); } /* TLS/DTLS content type for CPL SFO */ static inline unsigned char tls_content_type(unsigned char content_type) { switch (content_type) { case CONTENT_TYPE_CCS: return CPL_TX_TLS_SFO_TYPE_CCS; case CONTENT_TYPE_ALERT: return CPL_TX_TLS_SFO_TYPE_ALERT; case CONTENT_TYPE_HANDSHAKE: return CPL_TX_TLS_SFO_TYPE_HANDSHAKE; case CONTENT_TYPE_APP_DATA: return CPL_TX_TLS_SFO_TYPE_DATA; default: return CPL_TX_TLS_SFO_TYPE_CUSTOM; } } /* TLS Key memory management */ static void clear_tls_keyid(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); if (tls_ofld->rx_key_addr >= 0) { t4_free_tls_keyid(sc, tls_ofld->rx_key_addr); tls_ofld->rx_key_addr = -1; } if (tls_ofld->tx_key_addr >= 0) { t4_free_tls_keyid(sc, tls_ofld->tx_key_addr); tls_ofld->tx_key_addr = -1; } } static int get_tp_plen_max(struct ktls_session *tls) { int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448; return (tls->params.max_frame_len <= 8192 ? 
plen : FC_TP_PLEN_MAX); } /* Send request to get the key-id */ static int tls_program_key_id(struct toepcb *toep, struct ktls_session *tls, int direction) { struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); struct ofld_tx_sdesc *txsd; int keyid; struct wrqe *wr; struct tls_key_req *kwr; struct tls_keyctx *kctx; #ifdef INVARIANTS int kwrlen, kctxlen, len; kwrlen = sizeof(*kwr); kctxlen = roundup2(sizeof(*kctx), 32); len = roundup2(kwrlen + kctxlen, 16); MPASS(TLS_KEY_WR_SZ == len); #endif if (toep->txsd_avail == 0) return (EAGAIN); if ((keyid = t4_alloc_tls_keyid(sc)) < 0) { return (ENOSPC); } wr = alloc_wrqe(TLS_KEY_WR_SZ, &toep->ofld_txq->wrq); if (wr == NULL) { t4_free_tls_keyid(sc, keyid); return (ENOMEM); } kwr = wrtod(wr); memset(kwr, 0, TLS_KEY_WR_SZ); t4_write_tlskey_wr(tls, direction, toep->tid, F_FW_WR_COMPL, keyid, kwr); kctx = (struct tls_keyctx *)(kwr + 1); if (direction == KTLS_TX) tls_ofld->tx_key_addr = keyid; else tls_ofld->rx_key_addr = keyid; t4_tls_key_ctx(tls, direction, kctx); txsd = &toep->txsd[toep->txsd_pidx]; _Static_assert(DIV_ROUND_UP(TLS_KEY_WR_SZ, 16) <= MAX_OFLD_TX_SDESC_CREDITS, "MAX_OFLD_TX_SDESC_CREDITS too small"); txsd->tx_credits = DIV_ROUND_UP(TLS_KEY_WR_SZ, 16); txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); return (0); } int tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction) { struct adapter *sc = td_adapter(toep->td); int error, explicit_iv_size, mac_first; if (!can_tls_offload(sc)) return (EINVAL); if (direction == KTLS_RX) { if (ulp_mode(toep) != ULP_MODE_NONE) return (EINVAL); if ((toep->flags & TPF_TLS_STARTING) != 0) return (EINVAL); } else { switch (ulp_mode(toep)) { case ULP_MODE_NONE: case ULP_MODE_TLS: case ULP_MODE_TCPDDP: break; default: return (EINVAL); } } switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: /* XXX: Explicitly ignore any provided IV. */ switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: break; default: return (EPROTONOSUPPORT); } explicit_iv_size = AES_BLOCK_LEN; mac_first = 1; break; case CRYPTO_AES_NIST_GCM_16: if (tls->params.iv_len != SALT_SIZE) { return (EINVAL); } switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } explicit_iv_size = 8; mac_first = 0; break; default: return (EPROTONOSUPPORT); } /* Only TLS 1.1 and TLS 1.2 are currently supported. */ if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE || tls->params.tls_vminor < TLS_MINOR_VER_ONE || tls->params.tls_vminor > TLS_MINOR_VER_TWO) { return (EPROTONOSUPPORT); } /* Bail if we already have a key. 
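 * A key has already been programmed for this direction; rekeying an
 * offloaded session is not supported.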
*/ if (direction == KTLS_TX) { if (toep->tls.tx_key_addr != -1) return (EOPNOTSUPP); } else { if (toep->tls.rx_key_addr != -1) return (EOPNOTSUPP); } error = tls_program_key_id(toep, tls, direction); if (error) return (error); if (direction == KTLS_TX) { toep->tls.scmd0.seqno_numivs = (V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(t4_tls_proto_ver(tls)) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) | V_SCMD_CIPH_MODE(t4_tls_cipher_mode(tls)) | V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) | V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) | V_SCMD_IV_SIZE(explicit_iv_size / 2)); toep->tls.scmd0.ivgen_hdrlen = (V_SCMD_IV_GEN_CTRL(1) | V_SCMD_KEY_CTX_INLINE(0) | V_SCMD_TLS_FRAG_ENABLE(1)); toep->tls.iv_len = explicit_iv_size; toep->tls.frag_size = tls->params.max_frame_len; toep->tls.fcplenmax = get_tp_plen_max(tls); toep->tls.expn_per_ulp = tls->params.tls_hlen + tls->params.tls_tlen; toep->tls.pdus_per_ulp = 1; toep->tls.adjusted_plen = toep->tls.expn_per_ulp + tls->params.max_frame_len; toep->tls.tx_key_info_size = t4_tls_key_info_size(tls); } else { toep->flags |= TPF_TLS_STARTING | TPF_TLS_RX_QUIESCING; toep->tls.rx_version = tls->params.tls_vmajor << 8 | tls->params.tls_vminor; CTR2(KTR_CXGBE, "%s: tid %d setting RX_QUIESCE", __func__, toep->tid); t4_set_rx_quiesce(toep); } return (0); } void tls_init_toep(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; tls_ofld->rx_key_addr = -1; tls_ofld->tx_key_addr = -1; } void tls_uninit_toep(struct toepcb *toep) { clear_tls_keyid(toep); } #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) #define MIN_OFLD_TLSTX_CREDITS(toep) \ (howmany(sizeof(struct fw_tlstx_data_wr) + \ sizeof(struct cpl_tx_tls_sfo) + sizeof(struct ulptx_idata) + \ sizeof(struct ulptx_sc_memrd) + \ AES_BLOCK_LEN + 1, 16)) static void write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep, unsigned int plen, unsigned int expn, uint8_t credits, int shove) { struct tls_ofld_info *tls_ofld = &toep->tls; unsigned int len = plen + expn; txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) | V_FW_TLSTX_DATA_WR_COMPL(1) | V_FW_TLSTX_DATA_WR_IMMDLEN(0)); txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) | V_FW_TLSTX_DATA_WR_LEN16(credits)); txwr->plen = htobe32(len); txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) | V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove)); txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(1) | V_FW_TLSTX_DATA_WR_EXP(expn) | V_FW_TLSTX_DATA_WR_CTXLOC(TLS_SFO_WR_CONTEXTLOC_DDR) | V_FW_TLSTX_DATA_WR_IVDSGL(0) | V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->tx_key_info_size >> 4)); txwr->mfs = htobe16(tls_ofld->frag_size); txwr->adjustedplen_pkd = htobe16( V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen)); txwr->expinplenmax_pkd = htobe16( V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp)); txwr->pdusinplenmax_pkd = V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp); } static void write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep, struct tls_hdr *tls_hdr, unsigned int plen, uint64_t seqno) { struct tls_ofld_info *tls_ofld = &toep->tls; int data_type, seglen; seglen = plen; data_type = tls_content_type(tls_hdr->type); cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) | V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) | V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen)); cpl->pld_len = htobe32(plen); if (data_type == CPL_TX_TLS_SFO_TYPE_CUSTOM) cpl->type_protover = htobe32( V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type)); 
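/* scmd0 was precomputed when the transmit key was programmed in tls_alloc_ktls(). */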
cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs | V_SCMD_NUM_IVS(1)); cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen); cpl->scmd1 = htobe64(seqno); } static int count_ext_pgs_segs(struct mbuf *m) { vm_paddr_t nextpa; u_int i, nsegs; MPASS(m->m_epg_npgs > 0); nsegs = 1; nextpa = m->m_epg_pa[0] + PAGE_SIZE; for (i = 1; i < m->m_epg_npgs; i++) { if (nextpa != m->m_epg_pa[i]) nsegs++; nextpa = m->m_epg_pa[i] + PAGE_SIZE; } return (nsegs); } static void write_ktlstx_sgl(void *dst, struct mbuf *m, int nsegs) { struct ulptx_sgl *usgl = dst; vm_paddr_t pa; uint32_t len; int i, j; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); /* Figure out the first S/G length. */ pa = m->m_epg_pa[0] + m->m_epg_1st_off; usgl->addr0 = htobe64(pa); len = m_epg_pagelen(m, 0, m->m_epg_1st_off); pa += len; for (i = 1; i < m->m_epg_npgs; i++) { if (m->m_epg_pa[i] != pa) break; len += m_epg_pagelen(m, i, 0); pa += m_epg_pagelen(m, i, 0); } usgl->len0 = htobe32(len); #ifdef INVARIANTS nsegs--; #endif j = -1; for (; i < m->m_epg_npgs; i++) { if (j == -1 || m->m_epg_pa[i] != pa) { if (j >= 0) usgl->sge[j / 2].len[j & 1] = htobe32(len); j++; #ifdef INVARIANTS nsegs--; #endif pa = m->m_epg_pa[i]; usgl->sge[j / 2].addr[j & 1] = htobe64(pa); len = m_epg_pagelen(m, i, 0); pa += len; } else { len += m_epg_pagelen(m, i, 0); pa += m_epg_pagelen(m, i, 0); } } if (j >= 0) { usgl->sge[j / 2].len[j & 1] = htobe32(len); if ((j & 1) == 0) usgl->sge[j / 2].len[1] = htobe32(0); } KASSERT(nsegs == 0, ("%s: nsegs %d, m %p", __func__, nsegs, m)); } /* * Similar to t4_push_frames() but handles sockets that contain TLS * record mbufs. */ void t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop) { struct tls_hdr *thdr; struct fw_tlstx_data_wr *txwr; struct cpl_tx_tls_sfo *cpl; struct ulptx_idata *idata; struct ulptx_sc_memrd *memrd; struct wrqe *wr; struct mbuf *m; u_int nsegs, credits, wr_len; u_int expn_size; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; + struct mbufq *pduq = &toep->ulp_pduq; int tls_size, tx_credits, shove, sowwakeup; struct ofld_tx_sdesc *txsd; char *buf; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); KASSERT(tls_tx_key(toep), ("%s: TX key not set for toep %p", __func__, toep)); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags, drop); #endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function.
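 * (Normally a later credit return handled by do_fw4_ack() clears
 * TPF_TX_SUSPENDED and calls this function again.)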
*/ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; for (;;) { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); + if (__predict_false((m = mbufq_first(pduq)) != NULL)) { + if (!t4_push_raw_wr(sc, toep, m)) { + toep->flags |= TPF_TX_SUSPENDED; + return; + } + + (void)mbufq_dequeue(pduq); + + txsd = &toep->txsd[toep->txsd_pidx]; + continue; + } + SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); drop = 0; } m = sb->sb_sndptr != NULL ? sb->sb_sndptr->m_next : sb->sb_mb; /* * Send a FIN if requested, but only if there's no * more data to send. */ if (m == NULL && toep->flags & TPF_SEND_FIN) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); t4_close_conn(sc, toep); return; } /* * If there is no ready data to send, wait until more * data arrives. */ if (m == NULL || (m->m_flags & M_NOTREADY) != 0) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d no ready data to send", __func__, toep->tid); #endif return; } KASSERT(m->m_flags & M_EXTPG, ("%s: mbuf %p is not NOMAP", __func__, m)); KASSERT(m->m_epg_tls != NULL, ("%s: mbuf %p doesn't have TLS session", __func__, m)); /* Calculate WR length. */ wr_len = sizeof(struct fw_tlstx_data_wr) + sizeof(struct cpl_tx_tls_sfo) + sizeof(struct ulptx_idata) + sizeof(struct ulptx_sc_memrd); /* Explicit IVs for AES-CBC and AES-GCM are <= 16. */ MPASS(toep->tls.iv_len <= AES_BLOCK_LEN); wr_len += AES_BLOCK_LEN; /* Account for SGL in work request length. */ nsegs = count_ext_pgs_segs(m); wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; /* Not enough credits for this work request. */ if (howmany(wr_len, 16) > tx_credits) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d mbuf %p requires %d credits, but only %d available", __func__, toep->tid, m, howmany(wr_len, 16), tx_credits); #endif toep->flags |= TPF_TX_SUSPENDED; return; } /* Shove if there is no additional data pending. */ shove = ((m->m_next == NULL || (m->m_next->m_flags & M_NOTREADY) != 0)) && (tp->t_flags & TF_MORETOCOME) == 0; if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(so, SO_SND, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? 
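 * Suspending tx at least stops further attempts; recovery depends on
 * a later WR completion clearing TPF_TX_SUSPENDED.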
*/ toep->flags |= TPF_TX_SUSPENDED; return; } thdr = (struct tls_hdr *)&m->m_epg_hdr; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d TLS record %ju type %d len %#x", __func__, toep->tid, m->m_epg_seqno, thdr->type, m->m_len); #endif txwr = wrtod(wr); cpl = (struct cpl_tx_tls_sfo *)(txwr + 1); memset(txwr, 0, roundup2(wr_len, 16)); credits = howmany(wr_len, 16); expn_size = m->m_epg_hdrlen + m->m_epg_trllen; tls_size = m->m_len - expn_size; write_tlstx_wr(txwr, toep, tls_size, expn_size, credits, shove); write_tlstx_cpl(cpl, toep, thdr, tls_size, m->m_epg_seqno); idata = (struct ulptx_idata *)(cpl + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); idata->len = htobe32(0); memrd = (struct ulptx_sc_memrd *)(idata + 1); memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) | V_ULP_TX_SC_MORE(1) | V_ULPTX_LEN16(toep->tls.tx_key_info_size >> 4)); memrd->addr = htobe32(toep->tls.tx_key_addr >> 5); /* Copy IV. */ buf = (char *)(memrd + 1); memcpy(buf, thdr + 1, toep->tls.iv_len); buf += AES_BLOCK_LEN; write_ktlstx_sgl(buf, m, nsegs); KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; tp->snd_nxt += m->m_len; tp->snd_max += m->m_len; SOCKBUF_LOCK(sb); sb->sb_sndptr = m; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep)) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); KASSERT(m->m_len <= MAX_OFLD_TX_SDESC_PLEN, ("%s: plen %u too large", __func__, m->m_len)); txsd->plen = m->m_len; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; counter_u64_add(toep->ofld_txq->tx_toe_tls_records, 1); counter_u64_add(toep->ofld_txq->tx_toe_tls_octets, m->m_len); t4_l2t_send(sc, wr, toep->l2te); } } /* * For TLS data we place received mbufs received via CPL_TLS_DATA into * an mbufq in the TLS offload state. When CPL_RX_TLS_CMP is * received, the completed PDUs are placed into the socket receive * buffer. * * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs. */ static int do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_tls_data *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; int len; /* XXX: Should this match do_rx_data instead? */ KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; toep->ofld_rxq->rx_toe_tls_octets += len; KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } /* Save TCP sequence number. 
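 * The sequence number is stashed in the packet header (tls_tcp_seq)
 * so that do_rx_tls_cmp() can match this ciphertext mbuf against the
 * completed PDU it belongs to.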
*/ m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq); if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) { #ifdef INVARIANTS panic("Failed to queue TLS data packet"); #else printf("%s: Failed to queue TLS data packet\n", __func__); INP_WUNLOCK(inp); m_freem(m); return (0); #endif } tp = intotcpcb(inp); tp->t_rcvtime = ticks; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, be32toh(cpl->seq)); #endif INP_WUNLOCK(inp); return (0); } static int do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *); struct tlsrx_hdr_pkt *tls_hdr_pkt; unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct mbuf *tls_data; struct tls_get_record *tgr; struct mbuf *control; int pdu_length, trailer_len; #if defined(KTR) || defined(INVARIANTS) int len; #endif KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); #if defined(KTR) || defined(INVARIANTS) len = m->m_pkthdr.len; #endif toep->ofld_rxq->rx_toe_tls_records++; KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length)); so = inp_inpcbtosocket(inp); tp = intotcpcb(inp); #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u", __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt); #endif tp->rcv_nxt += pdu_length; KASSERT(tp->rcv_wnd >= pdu_length, ("%s: negative window size", __func__)); tp->rcv_wnd -= pdu_length; /* XXX: Not sure what to do about urgent data. */ /* * The payload of this CPL is the TLS header followed by * additional fields. */ KASSERT(m->m_len >= sizeof(*tls_hdr_pkt), ("%s: payload too small", __func__)); tls_hdr_pkt = mtod(m, void *); tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq); if (tls_data != NULL) { KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq, ("%s: sequence mismatch", __func__)); } /* Report decryption errors as EBADMSG. */ if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) { CTR4(KTR_CXGBE, "%s: tid %u TLS error %#x ddp_vld %#x", __func__, toep->tid, tls_hdr_pkt->res_to_mac_error, be32toh(cpl->ddp_valid)); m_freem(m); m_freem(tls_data); CURVNET_SET(toep->vnet); so->so_error = EBADMSG; sorwakeup(so); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } /* Handle data received after the socket is closed. 
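 * The peer sent data that can never be delivered, so drop the
 * connection with ECONNRESET.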
*/ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { struct epoch_tracker et; CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, pdu_length); m_freem(m); m_freem(tls_data); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * If there is any data in the 'sb_mtls' chain of the socket * or we aren't able to allocate the control mbuf, append the * record as a CSUM_TLS_DECRYPTED packet to 'sb_mtls' rather * than as a decrypted record to 'sb_m'. */ if (sb->sb_mtls != NULL) control = NULL; else control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD, IPPROTO_TCP, M_NOWAIT); if (control != NULL) { tgr = (struct tls_get_record *) CMSG_DATA(mtod(control, struct cmsghdr *)); memset(tgr, 0, sizeof(*tgr)); tgr->tls_type = tls_hdr_pkt->type; tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8; tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff; if (tls_data != NULL) { m_last(tls_data)->m_flags |= M_EOR; tgr->tls_length = htobe16(tls_data->m_pkthdr.len); } else tgr->tls_length = 0; m_freem(m); m = tls_data; } else { M_ASSERTPKTHDR(m); /* It's ok that any explicit IV is missing. */ m->m_len = sb->sb_tls_info->params.tls_hlen; m->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; m->m_pkthdr.len = m->m_len; if (tls_data != NULL) { m->m_pkthdr.len += tls_data->m_pkthdr.len; m_demote_pkthdr(tls_data); m->m_next = tls_data; } /* * Grow the chain by the trailer, but without * contents. The trailer will be thrown away by * ktls_decrypt. Note that ktls_decrypt assumes the * trailer is tls_tlen bytes long, so append that many * bytes not the actual trailer size computed from * pdu_length. */ trailer_len = sb->sb_tls_info->params.tls_tlen; if (tls_data != NULL) { m_last(tls_data)->m_len += trailer_len; tls_data = NULL; } else m->m_len += trailer_len; m->m_pkthdr.len += trailer_len; tls_hdr_pkt->length = htobe16(m->m_pkthdr.len - sizeof(struct tls_record_layer)); } /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } if (control != NULL) sbappendcontrol_locked(sb, m, control, 0); else sbappendstream_locked(sb, m, 0); t4_rcvd_locked(&toep->td->tod, tp); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void do_rx_data_tls(const struct cpl_rx_data *cpl, struct toepcb *toep, struct mbuf *m) { struct inpcb *inp = toep->inp; struct tls_ofld_info *tls_ofld = &toep->tls; struct tls_hdr *hdr; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; int len; len = m->m_pkthdr.len; INP_WLOCK_ASSERT(inp); so = inp_inpcbtosocket(inp); tp = intotcpcb(inp); sb = &so->so_rcv; SOCKBUF_LOCK(sb); CURVNET_SET(toep->vnet); tp->rcv_nxt += len; KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); tp->rcv_wnd -= len; /* Do we have a full TLS header? */ if (len < sizeof(*hdr)) { CTR3(KTR_CXGBE, "%s: tid %u len %d: too short for a TLS header", __func__, toep->tid, len); so->so_error = EMSGSIZE; goto out; } hdr = mtod(m, struct tls_hdr *); /* Is the header valid? 
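 * The record version must match the negotiated session version and
 * the length field must cover at least a full header.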
*/ if (be16toh(hdr->version) != tls_ofld->rx_version) { CTR3(KTR_CXGBE, "%s: tid %u invalid version %04x", __func__, toep->tid, be16toh(hdr->version)); so->so_error = EINVAL; goto out; } if (be16toh(hdr->length) < sizeof(*hdr)) { CTR3(KTR_CXGBE, "%s: tid %u invalid length %u", __func__, toep->tid, be16toh(hdr->length)); so->so_error = EBADMSG; goto out; } /* Did we get a truncated record? */ if (len < be16toh(hdr->length)) { CTR4(KTR_CXGBE, "%s: tid %u truncated TLS record (%d vs %u)", __func__, toep->tid, len, be16toh(hdr->length)); so->so_error = EMSGSIZE; goto out; } /* Is the header type unknown? */ switch (hdr->type) { case CONTENT_TYPE_CCS: case CONTENT_TYPE_ALERT: case CONTENT_TYPE_APP_DATA: case CONTENT_TYPE_HANDSHAKE: break; default: CTR3(KTR_CXGBE, "%s: tid %u invalid TLS record type %u", __func__, toep->tid, hdr->type); so->so_error = EBADMSG; goto out; } /* * Just punt. Although this could fall back to software * decryption, this case should never really happen. */ CTR4(KTR_CXGBE, "%s: tid %u dropping TLS record type %u, length %u", __func__, toep->tid, hdr->type, be16toh(hdr->length)); so->so_error = EBADMSG; out: sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); CURVNET_RESTORE(); m_freem(m); } /* * Send a work request setting multiple TCB fields to enable * ULP_MODE_TLS. */ static void tls_update_tcb(struct adapter *sc, struct toepcb *toep, uint64_t seqno) { struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int fields, key_offset, len; KASSERT(ulp_mode(toep) == ULP_MODE_NONE, ("%s: tid %d already ULP_MODE_TLS", __func__, toep->tid)); fields = 0; /* 2 writes for the overlay region */ fields += 2; /* W_TCB_TLS_SEQ */ fields++; /* W_TCB_ULP_RAW */ fields++; /* W_TCB_ULP_TYPE */ fields ++; /* W_TCB_T_FLAGS */ fields++; len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16); KASSERT(len <= SGE_MAX_WR_LEN, ("%s: WR with %d TCB field updates too large", __func__, fields)); wr = alloc_wrqe(len, toep->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: out of memory", __func__); } wrh = wrtod(wr); INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* * Clear the TLS overlay region: 1023:832. * * Words 26/27 are always set to zero. Words 28/29 * contain seqno and are set when enabling TLS * decryption. Word 30 is zero and Word 31 contains * the keyid. */ ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26, 0xffffffffffffffff, 0); /* * RX key tags are an index into the key portion of MA * memory stored as an offset from the base address in * units of 64 bytes. */ key_offset = toep->tls.rx_key_addr - sc->vres.key.start; ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30, 0xffffffffffffffff, (uint64_t)V_TCB_RX_TLS_KEY_TAG(key_offset / 64) << 32); CTR3(KTR_CXGBE, "%s: tid %d enable TLS seqno %lu", __func__, toep->tid, seqno); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TLS_SEQ, V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), V_TCB_TLS_SEQ(seqno)); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW), V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) | V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1)))); toep->flags &= ~TPF_TLS_STARTING; toep->flags |= TPF_TLS_RECEIVE; /* Set the ULP mode to ULP_MODE_TLS. */ toep->params.ulp_mode = ULP_MODE_TLS; ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_TYPE, V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_TLS)); /* Clear TF_RX_QUIESCE. 
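 * This is the final field update in the atomic ULP_TX work request;
 * once it takes effect the chip resumes receive processing in
 * ULP_MODE_TLS.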
*/ ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0); t4_wrq_tx(sc, wr); } /* * Examine the pending data in the socket buffer and either enable TLS * RX or request more encrypted data. */ static void tls_check_rx_sockbuf(struct adapter *sc, struct toepcb *toep, struct sockbuf *sb) { uint64_t seqno; size_t resid; bool have_header; SOCKBUF_LOCK_ASSERT(sb); MPASS(toep->tls.rx_resid == 0); have_header = ktls_pending_rx_info(sb, &seqno, &resid); CTR5(KTR_CXGBE, "%s: tid %d have_header %d seqno %lu resid %zu", __func__, toep->tid, have_header, seqno, resid); /* * If we have a partial header or we need fewer bytes than the * size of a TLS record, re-enable receive and pause again once * we get more data to try again. */ if (!have_header || resid != 0) { CTR(KTR_CXGBE, "%s: tid %d waiting for more data", __func__, toep->tid); toep->flags &= ~TPF_TLS_RX_QUIESCED; t4_clear_rx_quiesce(toep); return; } tls_update_tcb(sc, toep, seqno); } void tls_received_starting_data(struct adapter *sc, struct toepcb *toep, struct sockbuf *sb, int len) { MPASS(toep->flags & TPF_TLS_STARTING); /* Data was received before quiescing took effect. */ if ((toep->flags & TPF_TLS_RX_QUIESCING) != 0) return; /* * A previous call to tls_check_rx_sockbuf needed more data. * Now that more data has arrived, quiesce receive again and * check the state once the quiesce has completed. */ if ((toep->flags & TPF_TLS_RX_QUIESCED) == 0) { CTR(KTR_CXGBE, "%s: tid %d quiescing", __func__, toep->tid); toep->flags |= TPF_TLS_RX_QUIESCING; t4_set_rx_quiesce(toep); return; } KASSERT(len <= toep->tls.rx_resid, ("%s: received excess bytes %d (waiting for %zu)", __func__, len, toep->tls.rx_resid)); toep->tls.rx_resid -= len; if (toep->tls.rx_resid != 0) return; tls_check_rx_sockbuf(sc, toep, sb); } static int do_tls_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep; struct inpcb *inp; struct socket *so; struct sockbuf *sb; if (cpl->status != CPL_ERR_NONE) panic("XXX: tcp_rpl failed: %d", cpl->status); toep = lookup_tid(sc, tid); inp = toep->inp; switch (cpl->cookie) { case V_WORD(W_TCB_T_FLAGS) | V_COOKIE(CPL_COOKIE_TOM): INP_WLOCK(inp); if ((toep->flags & TPF_TLS_STARTING) == 0) panic("%s: connection is not starting TLS RX\n", __func__); MPASS((toep->flags & TPF_TLS_RX_QUIESCING) != 0); toep->flags &= ~TPF_TLS_RX_QUIESCING; toep->flags |= TPF_TLS_RX_QUIESCED; so = inp->inp_socket; sb = &so->so_rcv; SOCKBUF_LOCK(sb); tls_check_rx_sockbuf(sc, toep, sb); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); break; default: panic("XXX: unknown tcb_rpl offset %#x, cookie %#x", G_WORD(cpl->cookie), G_COOKIE(cpl->cookie)); } return (0); } void t4_tls_mod_load(void) { t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data); t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_tls_tcb_rpl, CPL_COOKIE_TOM); } void t4_tls_mod_unload(void) { t4_register_cpl_handler(CPL_TLS_DATA, NULL); t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_TOM); } #endif /* TCP_OFFLOAD */ #endif /* KERN_TLS */ diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 3b2243aeb69f..3dfa24a33f85 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -1,580 +1,582 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012, 2015 
Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef __T4_TOM_H__ #define __T4_TOM_H__ #include #include "common/t4_hw.h" #include "common/t4_msg.h" #include "tom/t4_tls.h" #define LISTEN_HASH_SIZE 32 /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. 
*/ #define MAX_RCV_WND ((1U << 27) - 1) #define DDP_RSVD_WIN (16 * 1024U) #define SB_DDP_INDICATE SB_IN_TOE /* soreceive must respond to indicate */ #define USE_DDP_RX_FLOW_CONTROL #define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) #define PPOD_SIZE (PPOD_SZ(1)) /* TOE PCB flags */ enum { TPF_ATTACHED = (1 << 0), /* a tcpcb refers to this toepcb */ TPF_FLOWC_WR_SENT = (1 << 1), /* firmware flow context WR sent */ TPF_TX_DATA_SENT = (1 << 2), /* some data sent */ TPF_TX_SUSPENDED = (1 << 3), /* tx suspended for lack of resources */ TPF_SEND_FIN = (1 << 4), /* send FIN after all pending data */ TPF_FIN_SENT = (1 << 5), /* FIN has been sent */ TPF_ABORT_SHUTDOWN = (1 << 6), /* connection abort is in progress */ TPF_CPL_PENDING = (1 << 7), /* haven't received the last CPL */ TPF_SYNQE = (1 << 8), /* synq_entry, not really a toepcb */ TPF_SYNQE_EXPANDED = (1 << 9), /* toepcb ready, tid context updated */ TPF_TLS_STARTING = (1 << 10), /* starting TLS receive */ TPF_KTLS = (1 << 11), /* send TLS records from KTLS */ TPF_INITIALIZED = (1 << 12), /* init_toepcb has been called */ TPF_TLS_RECEIVE = (1 << 13), /* should receive TLS records */ TPF_TLS_RX_QUIESCING = (1 << 14), /* RX quiesced for TLS RX startup */ TPF_TLS_RX_QUIESCED = (1 << 15), /* RX quiesced for TLS RX startup */ TPF_WAITING_FOR_FINAL = (1<< 16), /* waiting for wakeup on final CPL */ TPF_IN_TOEP_LIST = (1 << 17), /* toep is in the main td->toep_list */ }; enum { DDP_OK = (1 << 0), /* OK to turn on DDP */ DDP_SC_REQ = (1 << 1), /* state change (on/off) requested */ DDP_ON = (1 << 2), /* DDP is turned on */ DDP_BUF0_ACTIVE = (1 << 3), /* buffer 0 in use (not invalidated) */ DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */ DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */ DDP_DEAD = (1 << 6), /* toepcb is shutting down */ DDP_AIO = (1 << 7), /* DDP used for AIO, not so_rcv */ DDP_RCVBUF = (1 << 8), /* DDP used for so_rcv, not AIO */ }; struct bio; struct ctl_sg_entry; struct sockopt; struct offload_settings; /* * Connection parameters for an offloaded connection. These are mostly (but not * all) hardware TOE parameters. */ struct conn_params { int8_t rx_coalesce; int8_t cong_algo; int8_t tc_idx; int8_t tstamp; int8_t sack; int8_t nagle; int8_t keepalive; int8_t wscale; int8_t ecn; int8_t mtu_idx; int8_t ulp_mode; int8_t tx_align; int16_t txq_idx; /* ofld_txq = &sc->sge.ofld_txq[txq_idx] */ int16_t rxq_idx; /* ofld_rxq = &sc->sge.ofld_rxq[rxq_idx] */ int16_t l2t_idx; uint16_t emss; uint16_t opt0_bufsize; u_int sndbuf; /* controls TP tx pages */ }; struct ofld_tx_sdesc { uint32_t plen : 26; /* payload length */ uint32_t tx_credits : 6; /* firmware tx credits (unit is 16B) */ }; #define MAX_OFLD_TX_SDESC_PLEN ((1u << 26) - 1) #define MAX_OFLD_TX_SDESC_CREDITS ((1u << 6) - 1) struct ppod_region { u_int pr_start; u_int pr_len; u_int pr_page_shift[4]; uint32_t pr_tag_mask; /* hardware tagmask for this region. */ uint32_t pr_invalid_bit; /* OR with this to invalidate tag. */ uint32_t pr_alias_mask; /* AND with tag to get alias bits. */ u_int pr_alias_shift; /* shift this much for first alias bit. 
*/ vmem_t *pr_arena; }; struct ppod_reservation { struct ppod_region *prsv_pr; uint32_t prsv_tag; /* Full tag: pgsz, alias, tag, color */ u_int prsv_nppods; }; struct pageset { TAILQ_ENTRY(pageset) link; vm_page_t *pages; int npages; int flags; int offset; /* offset in first page */ int len; struct ppod_reservation prsv; struct vmspace *vm; vm_offset_t start; u_int vm_timestamp; }; TAILQ_HEAD(pagesetq, pageset); #define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */ struct ddp_rcv_buffer { TAILQ_ENTRY(ddp_rcv_buffer) link; void *buf; struct ppod_reservation prsv; size_t len; u_int refs; }; struct ddp_buffer { union { /* DDP_AIO fields */ struct { struct pageset *ps; struct kaiocb *job; int cancel_pending; }; /* DDP_RCVBUF fields */ struct { struct ddp_rcv_buffer *drb; uint32_t placed; }; }; }; /* * (a) - DDP_AIO only * (r) - DDP_RCVBUF only */ struct ddp_pcb { struct mtx lock; u_int flags; int active_id; /* the currently active DDP buffer */ struct ddp_buffer db[2]; union { TAILQ_HEAD(, pageset) cached_pagesets; /* (a) */ TAILQ_HEAD(, ddp_rcv_buffer) cached_buffers; /* (r) */ }; TAILQ_HEAD(, kaiocb) aiojobq; /* (a) */ u_int waiting_count; /* (a) */ u_int active_count; u_int cached_count; struct task requeue_task; struct kaiocb *queueing; /* (a) */ struct mtx cache_lock; /* (r) */ }; struct toepcb { struct tom_data *td; struct inpcb *inp; /* backpointer to host stack's PCB */ u_int flags; /* miscellaneous flags */ TAILQ_ENTRY(toepcb) link; /* toep_list or stranded_toep_list */ int refcount; struct vnet *vnet; struct vi_info *vi; /* virtual interface */ struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ctrlq; struct l2t_entry *l2te; /* L2 table entry used by this connection */ struct clip_entry *ce; /* CLIP table entry used by this tid */ int tid; /* Connection identifier */ int incarnation; /* sc->incarnation when toepcb was allocated */ /* tx credit handling */ u_int tx_total; /* total tx WR credits (in 16B units) */ u_int tx_credits; /* tx WR credits (in 16B units) available */ u_int tx_nocompl; /* tx WR credits since last compl request */ u_int plen_nocompl; /* payload since last compl request */ struct conn_params params; void *ulpcb; void *ulpcb2; struct mbufq ulp_pduq; /* PDUs waiting to be sent out. */ struct mbufq ulp_pdu_reclaimq; struct ddp_pcb ddp; struct tls_ofld_info tls; TAILQ_HEAD(, kaiocb) aiotx_jobq; struct task aiotx_task; struct socket *aiotx_so; /* Tx software descriptor */ uint8_t txsd_total; uint8_t txsd_pidx; uint8_t txsd_cidx; uint8_t txsd_avail; struct ofld_tx_sdesc txsd[]; }; static inline int ulp_mode(struct toepcb *toep) { return (toep->params.ulp_mode); } #define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock) #define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock) #define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED) #define DDP_CACHE_LOCK(toep) mtx_lock(&(toep)->ddp.cache_lock) #define DDP_CACHE_UNLOCK(toep) mtx_unlock(&(toep)->ddp.cache_lock) /* * Compressed state for embryonic connections for a listener. 
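 * A synq_entry stands in for a full toepcb while the connection is in
 * SYN_RCVD; it is expanded into a toepcb once the handshake completes.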
*/ struct synq_entry { struct listen_ctx *lctx; /* backpointer to listen ctx */ struct mbuf *syn; int flags; /* same as toepcb's tp_flags */ TAILQ_ENTRY(synq_entry) link; /* synqe_list */ volatile int ok_to_respond; volatile u_int refcnt; int tid; uint32_t iss; uint32_t irs; uint32_t ts; uint32_t rss_hash; __be16 tcp_opt; /* from cpl_pass_establish */ int incarnation; struct toepcb *toep; struct conn_params params; }; /* listen_ctx flags */ #define LCTX_RPL_PENDING 1 /* waiting for a CPL_PASS_OPEN_RPL */ #define LCTX_SETUP_IN_HW 2 /* stid entry is setup in hardware */ struct listen_ctx { LIST_ENTRY(listen_ctx) link; /* listen hash linkage */ volatile int refcount; int stid; int flags; bool isipv6; struct inpcb *inp; /* listening socket's inp */ struct vnet *vnet; struct sge_wrq *ctrlq; struct sge_ofld_rxq *ofld_rxq; struct clip_entry *ce; }; /* tcb_histent flags */ #define TE_RPL_PENDING 1 #define TE_ACTIVE 2 /* bits in one 8b tcb_histent sample. */ #define TS_RTO (1 << 0) #define TS_DUPACKS (1 << 1) #define TS_FASTREXMT (1 << 2) #define TS_SND_BACKLOGGED (1 << 3) #define TS_CWND_LIMITED (1 << 4) #define TS_ECN_ECE (1 << 5) #define TS_ECN_CWR (1 << 6) #define TS_RESERVED (1 << 7) /* Unused. */ struct tcb_histent { struct mtx te_lock; struct callout te_callout; uint64_t te_tcb[TCB_SIZE / sizeof(uint64_t)]; struct adapter *te_adapter; u_int te_flags; u_int te_tid; uint8_t te_pidx; uint8_t te_sample[100]; }; struct tom_data { struct toedev tod; /* toepcb's associated with this TOE device */ struct mtx toep_list_lock; TAILQ_HEAD(, toepcb) toep_list; TAILQ_HEAD(, synq_entry) synqe_list; /* List of tids left stranded because hw stopped abruptly. */ TAILQ_HEAD(, toepcb) stranded_atids; TAILQ_HEAD(, toepcb) stranded_tids; TAILQ_HEAD(, synq_entry) stranded_synqe; struct task cleanup_stranded_tids; struct mtx lctx_hash_lock; LIST_HEAD(, listen_ctx) *listen_hash; u_long listen_mask; int lctx_count; /* # of lctx in the hash table */ struct ppod_region pr; struct rwlock tcb_history_lock __aligned(CACHE_LINE_SIZE); struct tcb_histent **tcb_history; int dupack_threshold; /* WRs that will not be sent to the chip because L2 resolution failed */ struct mtx unsent_wr_lock; STAILQ_HEAD(, wrqe) unsent_wr_list; struct task reclaim_wr_resources; }; static inline struct tom_data * tod_td(struct toedev *tod) { return (__containerof(tod, struct tom_data, tod)); } static inline struct adapter * td_adapter(struct tom_data *td) { return (td->tod.tod_softc); } static inline void set_mbuf_raw_wr(struct mbuf *m, bool raw) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[6] = raw; } static inline bool mbuf_raw_wr(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[6]); } static inline void set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[0] = ulp_submode; } static inline uint8_t mbuf_ulp_submode(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[0]); } static inline void set_mbuf_iscsi_iso(struct mbuf *m, bool iso) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[1] = iso; } static inline bool mbuf_iscsi_iso(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[1]); } /* Flags for iSCSI segmentation offload. 
*/ #define CXGBE_ISO_TYPE(flags) ((flags) & 0x3) #define CXGBE_ISO_F 0x4 static inline void set_mbuf_iscsi_iso_flags(struct mbuf *m, uint8_t flags) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[2] = flags; } static inline uint8_t mbuf_iscsi_iso_flags(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[2]); } static inline void set_mbuf_iscsi_iso_mss(struct mbuf *m, uint16_t mss) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.sixteen[2] = mss; } static inline uint16_t mbuf_iscsi_iso_mss(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.sixteen[2]); } /* t4_tom.c */ struct toepcb *alloc_toepcb(struct vi_info *, int); int init_toepcb(struct vi_info *, struct toepcb *); struct toepcb *hold_toepcb(struct toepcb *); void free_toepcb(struct toepcb *); void offload_socket(struct socket *, struct toepcb *); void restore_so_proto(struct socket *, bool); void undo_offload_socket(struct socket *); void final_cpl_received(struct toepcb *); void insert_tid(struct adapter *, int, void *, int); void *lookup_tid(struct adapter *, int); void update_tid(struct adapter *, int, void *); void remove_tid(struct adapter *, int, int); u_long select_rcv_wnd(struct socket *); int select_rcv_wscale(void); void init_conn_params(struct vi_info *, struct offload_settings *, struct in_conninfo *, struct socket *, const struct tcp_options *, int16_t, struct conn_params *cp); __be64 calc_options0(struct vi_info *, struct conn_params *); __be32 calc_options2(struct vi_info *, struct conn_params *); uint64_t select_ntuple(struct vi_info *, struct l2t_entry *); int negative_advice(int); int add_tid_to_history(struct adapter *, u_int); void t4_pcb_detach(struct toedev *, struct tcpcb *); /* t4_connect.c */ void t4_init_connect_cpl_handlers(void); void t4_uninit_connect_cpl_handlers(void); int t4_connect(struct toedev *, struct socket *, struct nhop_object *, struct sockaddr *); void act_open_failure_cleanup(struct adapter *, struct toepcb *, u_int); /* t4_listen.c */ void t4_init_listen_cpl_handlers(void); void t4_uninit_listen_cpl_handlers(void); int t4_listen_start(struct toedev *, struct tcpcb *); int t4_listen_stop(struct toedev *, struct tcpcb *); void t4_syncache_added(struct toedev *, void *); void t4_syncache_removed(struct toedev *, void *); int t4_syncache_respond(struct toedev *, void *, struct mbuf *); int do_abort_req_synqe(struct sge_iq *, const struct rss_header *, struct mbuf *); int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *, struct mbuf *); void t4_offload_socket(struct toedev *, void *, struct socket *); void synack_failure_cleanup(struct adapter *, struct synq_entry *); int alloc_stid_tab(struct adapter *); void free_stid_tab(struct adapter *); void stop_stid_tab(struct adapter *); void restart_stid_tab(struct adapter *); /* t4_cpl_io.c */ void aiotx_init_toep(struct toepcb *); int t4_aio_queue_aiotx(struct socket *, struct kaiocb *); void t4_init_cpl_io_handlers(void); void t4_uninit_cpl_io_handlers(void); void send_abort_rpl(struct adapter *, struct sge_ofld_txq *, int , int); void send_flowc_wr(struct toepcb *, struct tcpcb *); void send_reset(struct adapter *, struct toepcb *, uint32_t); int send_rx_credits(struct adapter *, struct toepcb *, int); void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t); int t4_close_conn(struct adapter *, struct toepcb *); void t4_rcvd(struct toedev *, struct tcpcb *); void t4_rcvd_locked(struct toedev *, struct tcpcb *); int t4_tod_output(struct toedev *, struct tcpcb *); int t4_send_fin(struct toedev *, struct 
tcpcb *); int t4_send_rst(struct toedev *, struct tcpcb *); void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *, uint16_t, uint64_t, uint64_t, int, int); void t4_push_pdus(struct adapter *, struct toepcb *, int); +bool t4_push_raw_wr(struct adapter *, struct toepcb *, struct mbuf *); +void t4_raw_wr_tx(struct adapter *, struct toepcb *, struct mbuf *); /* t4_ddp.c */ int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int, const char *); void t4_free_ppod_region(struct ppod_region *); int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *); int t4_alloc_page_pods_for_bio(struct ppod_region *, struct bio *, struct ppod_reservation *); int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int, struct ppod_reservation *); int t4_alloc_page_pods_for_sgl(struct ppod_region *, struct ctl_sg_entry *, int, struct ppod_reservation *); int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int, struct pageset *); int t4_write_page_pods_for_bio(struct adapter *, struct toepcb *, struct ppod_reservation *, struct bio *, struct mbufq *); int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *, struct ppod_reservation *, vm_offset_t, int, struct mbufq *); int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *, struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *); void t4_free_page_pods(struct ppod_reservation *); int t4_aio_queue_ddp(struct socket *, struct kaiocb *); int t4_enable_ddp_rcv(struct socket *, struct toepcb *); void t4_ddp_mod_load(void); void t4_ddp_mod_unload(void); void ddp_assert_empty(struct toepcb *); void ddp_uninit_toep(struct toepcb *); void ddp_queue_toep(struct toepcb *); void release_ddp_resources(struct toepcb *toep); void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t); void handle_ddp_indicate(struct toepcb *); void insert_ddp_data(struct toepcb *, uint32_t); const struct offload_settings *lookup_offload_policy(struct adapter *, int, struct mbuf *, uint16_t, struct inpcb *); /* t4_tls.c */ bool can_tls_offload(struct adapter *); void do_rx_data_tls(const struct cpl_rx_data *, struct toepcb *, struct mbuf *); void t4_push_ktls(struct adapter *, struct toepcb *, int); void tls_received_starting_data(struct adapter *, struct toepcb *, struct sockbuf *, int); void t4_tls_mod_load(void); void t4_tls_mod_unload(void); void tls_init_toep(struct toepcb *); int tls_tx_key(struct toepcb *); void tls_uninit_toep(struct toepcb *); int tls_alloc_ktls(struct toepcb *, struct ktls_session *, int); #endif