Index: head/sys/dev/cxgbe/tom/t4_cpl_io.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_cpl_io.c (revision 366853) +++ head/sys/dev/cxgbe/tom/t4_cpl_io.c (revision 366854) @@ -1,2304 +1,2294 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 */

/*
 * NOTE(review): every bare "#include" below has no header operand in this
 * copy of the file; the header names appear to have been lost when the text
 * was extracted.  Restore them from the upstream file before building.
 */
#include
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define TCPSTATES
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void	t4_aiotx_cancel(struct kaiocb *job);
static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

/*
 * Build and transmit the FW_FLOWC_WR work request for a tid.
 *
 * toep: offload connection; must not have TPF_FLOWC_WR_SENT set yet.
 * tp:   TCP control block, or NULL.  When non-NULL the emss, snd_nxt and
 *       rcv_nxt are included; when NULL a fixed MSS of 512 is used instead.
 *
 * Panics if the work request cannot be allocated.  On return one tx
 * descriptor and howmany(flowclen, 16) tx credits have been consumed and
 * TPF_FLOWC_WR_SENT is set.
 */
void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	/*
	 * Count the parameters up front so the WR can be sized.  Five are
	 * always present (PFNVFN, CH, PORT, IQID, SNDBUF); with a tp there
	 * are three more (MSS, SNDNXT, RCVNXT), without one only MSS.
	 */
	if (tp != NULL)
		nparams = 8;
	else
		nparams = 6;
	/* Optional parameters; each test here is repeated when they are filled in. */
	if (ulp_mode(toep) == ULP_MODE_TLS)
		nparams++;
	if (toep->tls.fcplenmax != 0)
		nparams++;
	if (toep->params.tc_idx != -1) {
		MPASS(toep->params.tc_idx >= 0 &&
		    toep->params.tc_idx < sc->chip_params->nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

/* Append one mnemonic/value pair at paramidx and advance paramidx. */
#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
	if (tp) {
		FLOWC_PARAM(MSS, toep->params.emss);
		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
	} else
		FLOWC_PARAM(MSS, 512);
	CTR6(KTR_CXGBE,
	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

	if (ulp_mode(toep) == ULP_MODE_TLS)
		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
	if (toep->tls.fcplenmax != 0)
		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
	if (toep->params.tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

	/* The count computed above must match what was actually written. */
	KASSERT(paramidx == nparams, ("nparams mismatch"));

	/* Record the credits in the tx descriptor; a flowc carries no payload. */
	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	/* Advance the producer index, wrapping at txsd_total. */
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
*/ static int update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) { int tc_idx, rc; const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; const int port_id = toep->vi->pi->port_id; CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); if (kbps == 0) { /* unbind */ tc_idx = -1; } else { rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); if (rc != 0) return (rc); MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); } if (toep->params.tc_idx != tc_idx) { struct wrqe *wr; struct fw_flowc_wr *flowc; int nparams = 1, flowclen, flowclen16; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { if (tc_idx >= 0) t4_release_cl_rl(sc, port_id, tc_idx); return (ENOMEM); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; if (tc_idx == -1) flowc->mnemval[0].val = htobe32(0xff); else flowc->mnemval[0].val = htobe32(tc_idx); txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); } if (toep->params.tc_idx >= 0) t4_release_cl_rl(sc, port_id, toep->params.tc_idx); toep->params.tc_idx = tc_idx; return (0); } #endif void send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) { struct wrqe *wr; struct cpl_abort_req *req; int tid = toep->tid; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ INP_WLOCK_ASSERT(inp); CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, 
inp_flags 0x%x%s", __func__, toep->tid, inp->inp_flags & INP_DROPPED ? "inp dropped" : tcpstates[tp->t_state], toep->flags, inp->inp_flags, toep->flags & TPF_ABORT_SHUTDOWN ? " (abort already in progress)" : ""); if (toep->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ toep->flags |= TPF_ABORT_SHUTDOWN; KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %d.", __func__, tid)); wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); if (inp->inp_flags & INP_DROPPED) req->rsvd0 = htobe32(snd_nxt); else req->rsvd0 = htobe32(tp->snd_nxt); req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); req->cmd = CPL_ABORT_SEND_RST; /* * XXX: What's the correct way to tell that the inp hasn't been detached * from its socket? Should I even be flushing the snd buffer here? */ if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) /* because I'm not sure. See comment above */ sbflush(&so->so_snd); } t4_l2t_send(sc, wr, toep->l2te); } /* * Called when a connection is established to translate the TCP options * reported by HW to FreeBSD's native format. 
*/ static void assign_rxopt(struct tcpcb *tp, uint16_t opt) { struct toepcb *toep = tp->t_toe; struct inpcb *inp = tp->t_inpcb; struct adapter *sc = td_adapter(toep->td); INP_LOCK_ASSERT(inp); toep->params.mtu_idx = G_TCPOPT_MSS(opt); tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (G_TCPOPT_TSTAMP(opt)) { toep->params.tstamp = 1; toep->params.emss -= TCPOLEN_TSTAMP_APPA; tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ tp->ts_recent = 0; /* hmmm */ tp->ts_recent_age = tcp_ts_getticks(); } else toep->params.tstamp = 0; if (G_TCPOPT_SACK(opt)) { toep->params.sack = 1; tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ } else { toep->params.sack = 0; tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ } if (G_TCPOPT_WSCALE_OK(opt)) tp->t_flags |= TF_RCVD_SCALE; /* Doing window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); } else toep->params.wscale = 0; CTR6(KTR_CXGBE, "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", toep->tid, toep->params.mtu_idx, toep->params.emss, toep->params.tstamp, toep->params.sack, toep->params.wscale); } /* * Completes some final bits of initialization for just established connections * and changes their state to TCPS_ESTABLISHED. * * The ISNs are from the exchange of SYNs. 
*/ void make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) { struct inpcb *inp = toep->inp; struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); uint16_t tcpopt = be16toh(opt); INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED, ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", __func__, toep->tid, so, inp, tp, toep); tcp_state_change(tp, TCPS_ESTABLISHED); tp->t_starttime = ticks; TCPSTAT_INC(tcps_connects); tp->irs = irs; tcp_rcvseqinit(tp); tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; tp->snd_nxt = iss + 1; tp->snd_max = iss + 1; assign_rxopt(tp, tcpopt); send_flowc_wr(toep, tp); soisconnected(so); } int send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct wrqe *wr; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); wr = alloc_wrqe(sizeof(*req), toep->ctrlq); if (wr == NULL) return (0); req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); t4_wrq_tx(sc, wr); return (credits); } void send_rx_modulate(struct adapter *sc, struct toepcb *toep) { struct wrqe *wr; struct cpl_rx_data_ack *req; wr = alloc_wrqe(sizeof(*req), toep->ctrlq); if (wr == NULL) return; req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); req->credit_dack = htobe32(F_RX_MODULATE_RX); t4_wrq_tx(sc, wr); } void t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; struct toepcb *toep = tp->t_toe; int rx_credits; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK_ASSERT(sb); 
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; - if (ulp_mode(toep) == ULP_MODE_TLS) { - if (toep->tls.rcv_over >= rx_credits) { - toep->tls.rcv_over -= rx_credits; - rx_credits = 0; - } else { - rx_credits -= toep->tls.rcv_over; - toep->tls.rcv_over = 0; - } - } - if (rx_credits > 0 && (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { rx_credits = send_rx_credits(sc, toep, rx_credits); tp->rcv_wnd += rx_credits; tp->rcv_adv += rx_credits; } else if (toep->flags & TPF_FORCE_CREDITS) send_rx_modulate(sc, toep); } void t4_rcvd(struct toedev *tod, struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; SOCKBUF_LOCK(sb); t4_rcvd_locked(tod, tp); SOCKBUF_UNLOCK(sb); } /* * Close a connection by sending a CPL_CLOSE_CON_REQ message. */ int t4_close_conn(struct adapter *sc, struct toepcb *toep) { struct wrqe *wr; struct cpl_close_con_req *req; unsigned int tid = toep->tid; CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); if (toep->flags & TPF_FIN_SENT) return (0); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, tid)); wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | V_FW_WR_FLOWID(tid)); req->wr.wr_lo = cpu_to_be64(0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); req->rsvd = 0; toep->flags |= TPF_FIN_SENT; toep->flags &= ~TPF_SEND_FIN; t4_l2t_send(sc, wr, toep->l2te); return (0); } #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) /* Maximum amount of immediate data we could stuff in a WR */ static inline int max_imm_payload(int tx_credits) { const int n = 1; /* Use no more than one desc for imm. data WR */ KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits < MIN_OFLD_TX_CREDITS) return (0); if (tx_credits >= (n * EQ_ESIZE) / 16) return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); else return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); } /* Maximum number of SGL entries we could stuff in a WR */ static inline int max_dsgl_nsegs(int tx_credits) { int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits < MIN_OFLD_TX_CREDITS) return (0); nseg += 2 * (sge_pair_credits * 16 / 24); if ((sge_pair_credits * 16) % 24 == 16) nseg++; return (nseg); } static inline void write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, unsigned int plen, uint8_t credits, int shove, int ulp_submode) { struct 
fw_ofld_tx_data_wr *txwr = dst; txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | V_FW_WR_IMMDLEN(immdlen)); txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | V_FW_WR_LEN16(credits)); txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); txwr->plen = htobe32(plen); if (toep->params.tx_align > 0) { if (plen < 2 * toep->params.emss) txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); else txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | (toep->params.nagle == 0 ? 0 : F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); } } /* * Generate a DSGL from a starting mbuf. The total number of segments and the * maximum segments in any one mbuf are provided. */ static void write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) { struct mbuf *m; struct ulptx_sgl *usgl = dst; int i, j, rc; struct sglist sg; struct sglist_seg segs[n]; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); sglist_init(&sg, n, segs); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); i = -1; for (m = start; m != stop; m = m->m_next) { if (m->m_flags & M_EXTPG) rc = sglist_append_mbuf_epg(&sg, m, mtod(m, vm_offset_t), m->m_len); else rc = sglist_append(&sg, mtod(m, void *), m->m_len); if (__predict_false(rc != 0)) panic("%s: sglist_append %d", __func__, rc); for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } sglist_reset(&sg); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", __func__, nsegs, start, stop)); } /* * Max number of SGL entries an offload tx work request can have. This is 41 * (1 + 40) for a full 512B work request. 
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
 *
 * Each pass of the loop builds one work request, either with immediate data
 * or with a DSGL, from as many unsent mbufs as the available credits allow.
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool nomap_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_TLS ||
	    ulp_mode(toep) == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	/* The changed flag stays set on failure, so the update is retried. */
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		/* Dropping bytes frees space, so writers must be woken up. */
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		/* First unsent mbuf: the one after sb_sndptr, or the head. */
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		nomap_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
				/*
				 * A KTLS mbuf: hand off to t4_push_ktls if
				 * nothing has been gathered yet, otherwise
				 * send what we have first.
				 */
				if (m->m_epg_tls != NULL) {
					toep->flags |= TPF_KTLS;
					if (plen == 0) {
						SOCKBUF_UNLOCK(sb);
						t4_push_ktls(sc, toep, 0);
						return;
					}
					break;
				}
#endif
				n = sglist_count_mbuf_epg(m,
				    mtod(m, vm_offset_t), m->m_len);
			} else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (m->m_flags & M_EXTPG)
				nomap_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;

			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		/* Decide whether to request a completion for this WR. */
		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		/* Grow an auto-sized send buffer that is nearly full. */
		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		/* Either branch leaves the sockbuf unlocked (asserted below). */
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL,
			    ("%s: nothing to send, but m != NULL", __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		/* Shove only when the buffer was fully consumed and no more is coming. */
		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !nomap_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			/* Zero the 8 bytes of padding up to the 16-byte boundary. */
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		/* Also ask for a completion when credits are running low. */
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		/* Commit the new send pointer now that the WR has been built. */
		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		/* Record the WR in the tx descriptor ring and advance, wrapping. */
		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

/*
 * Dequeue and free packets from q until plen bytes have been accounted for.
 * plen must cover whole packets that are actually on the queue (asserted).
 */
static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}

/*
 * Transmit queued PDUs (ULP_MODE_ISCSI only), one whole PDU per work request.
 *
 * drop is the number of bytes to release from the reclaim queue before
 * sending.  Transmission is suspended (TPF_TX_SUSPENDED) if a PDU does not
 * fit in the available credits or a work request cannot be allocated.
 */
void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;
	/* Extra sequence space per ULP submode; see adjusted_plen below. */
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop)
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		M_ASSERTPKTHDR(sndptr);

		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/*
			 * This mbuf would send us _over_ the nsegs limit.
			 * Suspend tx because the PDU can't be sent out.
			 */
			if (plen > max_imm && nsegs > max_nsegs) {
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		/*
		 * We have a PDU to send.  All of it goes out in one WR so 'm'
		 * is NULL.  A PDU's length is always a multiple of 4.
		 */
		MPASS(m == NULL);
		MPASS((plen & 3) == 0);
		MPASS(sndptr->m_pkthdr.len == plen);

		shove = !(tp->t_flags & TF_MORETOCOME);
		ulp_submode = mbuf_ulp_submode(sndptr);
		MPASS(ulp_submode < nitems(ulp_extra_len));

		/*
		 * plen doesn't include header and data digests, which are
		 * generated and inserted in the right places by the TOE, but
		 * they do occupy TCP sequence space and need to be accounted
		 * for.
		 */
		adjusted_plen = plen + ulp_extra_len[ulp_submode];
		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
			    shove, ulp_submode);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */
			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
			    shove, ulp_submode);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			/* Zero the 8 bytes of padding up to the 16-byte boundary. */
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		/* Move the PDU to the reclaim queue; rqdrop_locked frees it later. */
		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		/* Sequence space includes the extra bytes; txsd->plen below does not. */
		tp->snd_nxt += adjusted_plen;
		tp->snd_max += adjusted_plen;

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

/*
 * Dispatch to the transmit routine that matches the connection: iSCSI PDUs,
 * TLS records (TLS_MODE_TLSOM with a tx key), KTLS, or plain frames.
 */
static inline void
t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
{

	if (ulp_mode(toep) == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, drop);
	else if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM)
		t4_push_tls_records(sc, toep, drop);
#ifdef KERN_TLS
	else if (toep->flags & TPF_KTLS)
		t4_push_ktls(sc, toep, drop);
#endif
	else
		t4_push_frames(sc, toep, drop);
}

/* Push pending tx data for the connection.  Always returns 0. */
int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_data(sc, toep, 0);

	return (0);
}

/*
 * Request a FIN.  The flag is always recorded; data (and then the FIN) is
 * pushed right away only once the connection is at least ESTABLISHED.
 * Always returns 0.
 */
int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED)
		t4_push_data(sc, toep, 0);

	return (0);
}

/* Abort the connection via send_reset.  Always returns 0. */
int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	/* snd_nxt argument is unused here: the inp is asserted not dropped. */
	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_peer_close and if
		 * this is still a synqe instead of a toepcb then the connection
		 * must be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp",
	    toep->flags, toep->ddp.flags, inp);

	/* An abort is already under way; nothing to do for the FIN. */
	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	tp->rcv_nxt++;	/* FIN */

	so = inp->inp_socket;
	socantrcvmore(so);
	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		DDP_LOCK(toep);
		if (__predict_false(toep->ddp.flags &
		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
			handle_ddp_close(toep, tp, cpl->rcv_nxt);
		DDP_UNLOCK(toep);
	}

	if (ulp_mode(toep) != ULP_MODE_RDMA) {
		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
	    		be32toh(cpl->rcv_nxt)));
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_CLOSE_WAIT);
		break;

	case TCPS_FIN_WAIT_1:
		tcp_state_change(tp, TCPS_CLOSING);
		break;

	case TCPS_FIN_WAIT_2:
		/* The inp is unlocked after tcp_twstart (asserted below). */
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Peer has ACK'd our FIN.
*/
/*
 * Handler for CPL_CLOSE_CON_RPL (no payload mbuf is expected).  Updates
 * snd_una from the CPL (excluding the FIN) and advances the TCP state
 * machine.  In the CLOSING and LAST_ACK states no further CPLs are expected
 * and the toepcb is handed to final_cpl_received().  Always returns 0.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	/* The CPL immediately follows the RSS header. */
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	/* An abort is already in progress; nothing more to do here. */
	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		tcp_twstart(tp);
release:
		/*
		 * Shared tail for CLOSING and LAST_ACK: the inp is unlocked
		 * at this point (asserted) and is locked again before the
		 * toepcb is handed to final_cpl_received().
		 */
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		/* Unlock here only if tcp_close() handed the tcpcb back. */
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Build a CPL_ABORT_RPL for 'tid' carrying 'rst_status' in its cmd field
 * and transmit it on 'ofld_txq'.  Panics if the work request cannot be
 * allocated.
 */
void
send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

/*
 * Translate the abort reason reported by the hardware into an errno value
 * for the socket: EPIPE/ECONNRESET for resets (EPIPE when the connection is
 * in CLOSE_WAIT), ETIMEDOUT for the timeout reasons, EIO otherwise.
 */
static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 *
 * Handler for CPL_ABORT_REQ_RSS.  Synq entries are passed on to
 * do_abort_req_synqe() and negative advice is ignored.  Otherwise a
 * CPL_ABORT_RPL is always sent back before returning 0.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	/* The CPL immediately follows the RSS header. */
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	/* Read up front; the reply is sent after final_cpl_received(). */
	struct sge_wrq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 *
 * Handler for CPL_ABORT_RPL_RSS.  Synq entries are passed on to
 * do_abort_rpl_synqe().  Otherwise the toepcb, which must already be marked
 * TPF_ABORT_SHUTDOWN, is handed to final_cpl_received() with the inp
 * write-locked.  Always returns 0 for the non-synq case.
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	/* The CPL immediately follows the RSS header. */
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

/*
 * Handler for CPL_RX_DATA.  The CPL header is at the front of the mbuf 'm'
 * and is stripped before the payload is appended to the socket's receive
 * buffer.  Also updates rcv_nxt/rcv_wnd, grows the receive buffer when
 * autosizing applies, tracks DDP on/off transitions, and returns rx credits
 * to the hardware.  The mbuf is consumed on every path.  Always returns 0.
 */
static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct epoch_tracker et;
	int len, rx_credits;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_rx_data and if this
		 * is still a synqe instead of a toepcb then the connection must
		 * be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * A sequence number ahead of rcv_nxt is treated as the amount of
	 * data already placed by DDP (passed to insert_ddp_data() below).
	 */
	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
		    ("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_LOCK(toep);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		/*
		 * Data arrived after the receive side was shut down: discard
		 * it and drop the connection.  All locks are released first
		 * so that the inp lock can be taken again inside the net
		 * epoch.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
			DDP_UNLOCK(toep);
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		/* Stop autosizing if the reservation cannot be grown. */
		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
	}

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		/* True when the CPL's ddp_off disagrees with DDP_ON. */
		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;

		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
			    __func__, tid, len);

		if (changed) {
			if (toep->ddp.flags & DDP_SC_REQ)
				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
			else {
				KASSERT(cpl->ddp_off == 1,
				    ("%s: DDP switched on by itself.",
				    __func__));

				/* Fell out of DDP mode */
				toep->ddp.flags &= ~DDP_ON;
				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
				    __func__);

				insert_ddp_data(toep, ddp_placed);
			}
		}

		if (toep->ddp.flags & DDP_ON) {
			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.
			 * Start posting queued AIO requests via DDP.  The
			 * payload that arrived in this indicate is appended
			 * to the socket buffer as usual.
			 */
			handle_ddp_indicate(toep);
		}
	}

	sbappendstream_locked(sb, m, 0);
	/* Return credits for buffer space beyond the advertised window. */
	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}

	if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
	    sbavail(sb) != 0) {
		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
		    tid);
		ddp_queue_toep(toep);
	}
	/* The sockbuf is no longer locked after this (asserted below). */
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);
	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_UNLOCK(toep);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Handler for CPL_FW4_ACK: the firmware is returning tx credits (and,
 * optionally, a new snd_una).  Walks the tx descriptors covered by the
 * returned credits, totals the payload they carried, and then either
 * resumes a suspended transmit or drops the acknowledged bytes from the
 * send buffer.  Always returns 0.
 */
static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	/* The CPL immediately follows the RSS header. */
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	/*
	 * Retire tx descriptors starting at the consumer index until the
	 * returned credits are used up; the descriptor array is a ring.
	 */
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		if (txsd->iv_buffer) {
			free(txsd->iv_buffer, M_CXGBE);
			txsd->iv_buffer = NULL;
		}
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	/* All credits are back: reset the no-completion counters. */
	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		t4_push_data(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP
				 * mode changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (tls_tx_key(toep) &&
			    toep->tls.mode == TLS_MODE_TLSOM) {
				struct tls_ofld_info *tls_ofld = &toep->tls;

				MPASS(tls_ofld->sb_off >= plen);
				tls_ofld->sb_off -= plen;
			}
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

/*
 * Send a CPL_SET_TCB_FIELD for toep's tid on 'wrq': the TCB word 'word' is
 * updated with 'val' under 'mask'.  A reply is requested only when 'reply'
 * is non-zero, in which case 'cookie' must not be CPL_COOKIE_RESERVED.
 * When 'wrq' is an offload queue the request is also accounted for in the
 * toep's tx credits and tx descriptor ring.  Panics if the work request
 * cannot be allocated.
 */
void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
		/* Charge the request against the toep's credits. */
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

/*
 * Register the CPL handlers implemented in this file.
 */
void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

/*
 * Undo t4_init_cpl_io_handlers() by registering NULL for each of the same
 * opcodes.
 */
void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
*/ #define aio_error backend1 #define aio_sent backend3 #define aio_refs backend4 #define jobtotid(job) \ (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) static void aiotx_free_job(struct kaiocb *job) { long status; int error; if (refcount_release(&job->aio_refs) == 0) return; error = (intptr_t)job->aio_error; status = job->aio_sent; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, jobtotid(job), job, status, error); #endif if (error != 0 && status != 0) error = 0; if (error == ECANCELED) aio_cancel(job); else if (error) aio_complete(job, -1, error); else { job->msgsnd = 1; aio_complete(job, status, 0); } } static void aiotx_free_pgs(struct mbuf *m) { struct kaiocb *job; vm_page_t pg; M_ASSERTEXTPG(m); job = m->m_ext.ext_arg1; #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, m->m_len, jobtotid(job)); #endif for (int i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); vm_page_unwire(pg, PQ_ACTIVE); } aiotx_free_job(job); } /* * Allocate a chain of unmapped mbufs describing the next 'len' bytes * of an AIO job. */ static struct mbuf * alloc_aiotx_mbuf(struct kaiocb *job, int len) { struct vmspace *vm; vm_page_t pgs[MBUF_PEXT_MAX_PGS]; struct mbuf *m, *top, *last; vm_map_t map; vm_offset_t start; int i, mlen, npages, pgoff; KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, ("%s(%p, %d): request to send beyond end of buffer", __func__, job, len)); /* * The AIO subsystem will cancel and drain all requests before * permitting a process to exit or exec, so p_vmspace should * be stable here. 
*/ vm = job->userproc->p_vmspace; map = &vm->vm_map; start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent; pgoff = start & PAGE_MASK; top = NULL; last = NULL; while (len > 0) { mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff); KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0, ("%s: next start (%#jx + %#x) is not page aligned", __func__, (uintmax_t)start, mlen)); npages = vm_fault_quick_hold_pages(map, start, mlen, VM_PROT_WRITE, pgs, nitems(pgs)); if (npages < 0) break; m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs); if (m == NULL) { vm_page_unhold_pages(pgs, npages); break; } m->m_epg_1st_off = pgoff; m->m_epg_npgs = npages; if (npages == 1) { KASSERT(mlen + pgoff <= PAGE_SIZE, ("%s: single page is too large (off %d len %d)", __func__, pgoff, mlen)); m->m_epg_last_len = mlen; } else { m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) - (npages - 2) * PAGE_SIZE; } for (i = 0; i < npages; i++) m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]); m->m_len = mlen; m->m_ext.ext_size = npages * PAGE_SIZE; m->m_ext.ext_arg1 = job; refcount_acquire(&job->aio_refs); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d", __func__, jobtotid(job), m, job, npages); #endif if (top == NULL) top = m; else last->m_next = m; last = m; len -= mlen; start += mlen; pgoff = 0; } return (top); } static void t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) { struct sockbuf *sb; struct file *fp; struct inpcb *inp; struct tcpcb *tp; struct mbuf *m; int error, len; bool moretocome, sendmore; sb = &so->so_snd; SOCKBUF_UNLOCK(sb); fp = job->fd_file; m = NULL; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error != 0) goto out; #endif /* Inline sosend_generic(). 
*/ error = sblock(sb, SBL_WAIT); MPASS(error == 0); sendanother: SOCKBUF_LOCK(sb); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(sb); sbunlock(sb); if ((so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); sbunlock(sb); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(sb); sbunlock(sb); error = ENOTCONN; goto out; } if (sbspace(sb) < sb->sb_lowat) { MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); /* * Don't block if there is too little room in the socket * buffer. Instead, requeue the request. */ if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); sbunlock(sb); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); sbunlock(sb); goto out; } /* * Write as much data as the socket permits, but no more than a * a single sndbuf at a time. */ len = sbspace(sb); if (len > job->uaiocb.aio_nbytes - job->aio_sent) { len = job->uaiocb.aio_nbytes - job->aio_sent; moretocome = false; } else moretocome = true; if (len > toep->params.sndbuf) { len = toep->params.sndbuf; sendmore = true; } else sendmore = false; if (!TAILQ_EMPTY(&toep->aiotx_jobq)) moretocome = true; SOCKBUF_UNLOCK(sb); MPASS(len != 0); m = alloc_aiotx_mbuf(job, len); if (m == NULL) { sbunlock(sb); error = EFAULT; goto out; } /* Inlined tcp_usr_send(). 
*/ inp = toep->inp; INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); sbunlock(sb); error = ECONNRESET; goto out; } job->aio_sent += m_length(m, NULL); sbappendstream(sb, m, 0); m = NULL; if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); if (moretocome) tp->t_flags |= TF_MORETOCOME; error = tp->t_fb->tfb_tcp_output(tp); if (moretocome) tp->t_flags &= ~TF_MORETOCOME; } INP_WUNLOCK(inp); if (sendmore) goto sendanother; sbunlock(sb); if (error) goto out; /* * If this is a blocking socket and the request has not been * fully completed, requeue it until the socket is ready * again. */ if (job->aio_sent < job->uaiocb.aio_nbytes && !(so->so_state & SS_NBIO)) { SOCKBUF_LOCK(sb); if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); return; } /* * If the request will not be requeued, drop the queue's * reference to the job. Any mbufs in flight should still * hold a reference, but this drops the reference that the * queue owns while it is waiting to queue mbufs to the * socket. 
*/ aiotx_free_job(job); out: if (error) { job->aio_error = (void *)(intptr_t)error; aiotx_free_job(job); } if (m != NULL) m_free(m); SOCKBUF_LOCK(sb); } static void t4_aiotx_task(void *context, int pending) { struct toepcb *toep = context; struct socket *so; struct kaiocb *job; so = toep->aiotx_so; CURVNET_SET(toep->vnet); SOCKBUF_LOCK(&so->so_snd); while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { job = TAILQ_FIRST(&toep->aiotx_jobq); TAILQ_REMOVE(&toep->aiotx_jobq, job, list); if (!aio_clear_cancel_function(job)) continue; t4_aiotx_process_job(toep, so, job); } toep->aiotx_so = NULL; SOCKBUF_UNLOCK(&so->so_snd); CURVNET_RESTORE(); free_toepcb(toep); SOCK_LOCK(so); sorele(so); } static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep) { SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false"); #endif if (toep->aiotx_so != NULL) return; soref(so); toep->aiotx_so = so; hold_toepcb(toep); soaio_enqueue(&toep->aiotx_task); } static void t4_aiotx_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; struct tcpcb *tp; struct toepcb *toep; so = job->fd_file->f_data; tp = so_sototcpcb(so); toep = tp->t_toe; MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); sb = &so->so_snd; SOCKBUF_LOCK(sb); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); job->aio_error = (void *)(intptr_t)ECANCELED; aiotx_free_job(job); } int t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); /* This only handles writes. 
*/ if (job->uaiocb.aio_lio_opcode != LIO_WRITE) return (EOPNOTSUPP); if (!sc->tt.tx_zcopy) return (EOPNOTSUPP); if (tls_tx_key(toep)) return (EOPNOTSUPP); SOCKBUF_LOCK(&so->so_snd); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); #endif if (!aio_set_cancel_function(job, t4_aiotx_cancel)) panic("new job was cancelled"); refcount_init(&job->aio_refs, 1); TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); if (sowriteable(so)) t4_aiotx_queue_toep(so, toep); SOCKBUF_UNLOCK(&so->so_snd); return (0); } void aiotx_init_toep(struct toepcb *toep) { TAILQ_INIT(&toep->aiotx_jobq); TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); } #endif Index: head/sys/dev/cxgbe/tom/t4_tls.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_tls.c (revision 366853) +++ head/sys/dev/cxgbe/tom/t4_tls.c (revision 366854) @@ -1,2242 +1,2240 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017-2018 Chelsio Communications, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_kern_tls.h" #include __FBSDID("$FreeBSD$"); #include #include #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #ifdef KERN_TLS #include #include #endif #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_tcb.h" #include "crypto/t4_crypto.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while * the mbuf is in the ulp_pdu_reclaimq. */ #define tls_tcp_seq PH_loc.thirtytwo[0] /* * Handshake lock used for the handshake timer. Having a global lock * is perhaps not ideal, but it avoids having to use callout_drain() * in tls_uninit_toep() which can't block. Also, the timer shouldn't * actually fire for most connections. 
*/
static struct mtx tls_handshake_lock;

/*
 * Update one word of this connection's TCB via t4_set_tcb_field() on the
 * toep's offload tx queue, without requesting a reply.
 */
static void
t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
    uint64_t val)
{
	struct adapter *sc = td_adapter(toep->td);

	t4_set_tcb_field(sc, toep->ofld_txq, toep, word, mask, val, 0, 0);
}

/* TLS and DTLS common routines */

/*
 * True when TLS offload is enabled in the TOM tunables and the adapter
 * advertises the TLS keys crypto capability.
 */
bool
can_tls_offload(struct adapter *sc)
{

	return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS);
}

/* Non-zero if a tx key address has been assigned (tx_key_addr >= 0). */
int
tls_tx_key(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	return (tls_ofld->tx_key_addr >= 0);
}

/* Non-zero if an rx key address has been assigned (rx_key_addr >= 0). */
int
tls_rx_key(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	return (tls_ofld->rx_key_addr >= 0);
}

/*
 * Size of the key material carried in a work request: the tx key context
 * size when the key is sent as immediate data, KEY_IN_DDR_SIZE otherwise.
 */
static int
key_size(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	return ((tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) ?
		tls_ofld->k_ctx.tx_key_info_size : KEY_IN_DDR_SIZE);
}

/* Set TLS Key-Id in TCB */
static void
t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
{

	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
			 V_TCB_RX_TLS_KEY_TAG(key_id));
}

/* Clear TF_RX_QUIESCE to re-enable receive. */
static void
t4_clear_rx_quiesce(struct toepcb *toep)
{

	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
}

/*
 * Stop the handshake timer, rewrite the TCB's ULP_RAW field so that only
 * the TLS enable bit is set, and re-enable receive.
 */
static void
tls_clr_ofld_mode(struct toepcb *toep)
{

	tls_stop_handshake_timer(toep);

	/* Operate in PDU extraction mode only. */
	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
	t4_clear_rx_quiesce(toep);
}

/* Stop the handshake timer and re-enable receive. */
static void
tls_clr_quiesce(struct toepcb *toep)
{

	tls_stop_handshake_timer(toep);
	t4_clear_rx_quiesce(toep);
}

/*
 * Calculate the TLS data expansion size
 *
 * Returns the number of bytes that TLS framing adds to 'data_len' bytes of
 * payload when it is split into records of at most k_ctx.frag_size bytes.
 * For AES-GCM each record adds the header, the explicit AEAD data and the
 * tag.  For the other non-NOP cipher modes each record adds the header, one
 * cipher block, the MAC and block padding (a full block when the data is
 * already aligned).  Returns 0 for the NOP cipher mode.
 *
 * When 'full_pdus_only' is set, *pdus_per_ulp is set to the number of whole
 * records that fit in 'data_len' (clamped to 1..32) and the expansion is
 * computed for that many records only.
 */
static int
tls_expansion_size(struct toepcb *toep, int data_len, int full_pdus_only,
		   unsigned short *pdus_per_ulp)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;
	struct tls_scmd *scmd = &tls_ofld->scmd0;
	int expn_size = 0, frag_count = 0, pad_per_pdu = 0,
	    pad_last_pdu = 0, last_frag_size = 0, max_frag_size = 0;
	int exp_per_pdu = 0;
	int hdr_len = TLS_HEADER_LENGTH;

	/* The do/while(0) exists only so that 'break' can leave early. */
	do {
		max_frag_size = tls_ofld->k_ctx.frag_size;
		if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) ==
		   SCMD_CIPH_MODE_AES_GCM) {
			frag_count = (data_len / max_frag_size);
			exp_per_pdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
				hdr_len;
			expn_size =  frag_count * exp_per_pdu;
			if (full_pdus_only) {
				*pdus_per_ulp = data_len / (exp_per_pdu +
					max_frag_size);
				if (*pdus_per_ulp > 32)
					*pdus_per_ulp = 32;
				else if(!*pdus_per_ulp)
					*pdus_per_ulp = 1;
				expn_size = (*pdus_per_ulp) * exp_per_pdu;
				break;
			}
			/* A trailing partial record costs a full expansion. */
			if ((last_frag_size = data_len % max_frag_size) > 0) {
				frag_count += 1;
				expn_size += exp_per_pdu;
			}
			break;
		} else if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) !=
			   SCMD_CIPH_MODE_NOP) {
			/* Calculate the number of fragments we can make */
			frag_count  = (data_len / max_frag_size);
			if (frag_count > 0) {
				pad_per_pdu = (((howmany((max_frag_size +
						       tls_ofld->mac_length),
						      CIPHER_BLOCK_SIZE)) *
						CIPHER_BLOCK_SIZE) -
					       (max_frag_size +
						tls_ofld->mac_length));
				if (!pad_per_pdu)
					pad_per_pdu = CIPHER_BLOCK_SIZE;
				exp_per_pdu = pad_per_pdu +
				       	tls_ofld->mac_length +
					hdr_len + CIPHER_BLOCK_SIZE;
				expn_size = frag_count * exp_per_pdu;
			}
			if (full_pdus_only) {
				*pdus_per_ulp = data_len / (exp_per_pdu +
					max_frag_size);
				if (*pdus_per_ulp > 32)
					*pdus_per_ulp = 32;
				else if (!*pdus_per_ulp)
					*pdus_per_ulp = 1;
				expn_size = (*pdus_per_ulp) * exp_per_pdu;
				break;
			}
			/* Consider the last fragment */
			if ((last_frag_size = data_len % max_frag_size) > 0) {
				pad_last_pdu = (((howmany((last_frag_size +
							tls_ofld->mac_length),
						       CIPHER_BLOCK_SIZE)) *
						 CIPHER_BLOCK_SIZE) -
						(last_frag_size +
						 tls_ofld->mac_length));
				if (!pad_last_pdu)
					pad_last_pdu = CIPHER_BLOCK_SIZE;
				expn_size += (pad_last_pdu +
					      tls_ofld->mac_length + hdr_len +
					      CIPHER_BLOCK_SIZE);
			}
		}
	} while (0);

	return (expn_size);
}

/* Copy Key to WR */
/*
 * Writes the tx key reference into the work request at 'dst': for a key
 * stored in DDR, a NOOP sub-command followed by a memory-read sub-command
 * naming the key's address and length; for an immediate key, the key
 * context itself.  Does nothing when no tx key context has been set up.
 */
static void
tls_copy_tx_key(struct toepcb *toep, void *dst)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;
	struct ulptx_sc_memrd *sc_memrd;
	struct ulptx_idata *sc;

	if (tls_ofld->k_ctx.tx_key_info_size <= 0)
		return;

	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR) {
		sc = dst;
		sc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		sc->len = htobe32(0);
		sc_memrd = (struct ulptx_sc_memrd *)(sc + 1);
		/* The length is written right-shifted by 4, the address by 5. */
		sc_memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
		    V_ULP_TX_SC_MORE(1) |
		    V_ULPTX_LEN16(tls_ofld->k_ctx.tx_key_info_size >> 4));
		sc_memrd->addr = htobe32(tls_ofld->tx_key_addr >> 5);
	} else if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) {
		memcpy(dst, &tls_ofld->k_ctx.tx,
		    tls_ofld->k_ctx.tx_key_info_size);
	}
}

/* TLS/DTLS content type  for CPL SFO */
static inline unsigned char
tls_content_type(unsigned char content_type)
{
	/*
	 * XXX: Shouldn't this map CONTENT_TYPE_APP_DATA to DATA and
	 * default to "CUSTOM" for all other types including
	 * heartbeat?
	 */
	switch (content_type) {
	case CONTENT_TYPE_CCS:
		return CPL_TX_TLS_SFO_TYPE_CCS;
	case CONTENT_TYPE_ALERT:
		return CPL_TX_TLS_SFO_TYPE_ALERT;
	case CONTENT_TYPE_HANDSHAKE:
		return CPL_TX_TLS_SFO_TYPE_HANDSHAKE;
	case CONTENT_TYPE_HEARTBEAT:
		return CPL_TX_TLS_SFO_TYPE_HEARTBEAT;
	}
	/* Everything else is sent as application data. */
	return CPL_TX_TLS_SFO_TYPE_DATA;
}

/*
 * Map a cipher key size selector (AES_*) to the value written into the
 * key context's cipher-key-size field.  Unknown selectors are treated as
 * AES-256.
 */
static unsigned char
get_cipher_key_size(unsigned int ck_size)
{
	switch (ck_size) {
	case AES_NOP: /* NOP */
		return 15;
	case AES_128: /* AES128 */
		return CH_CK_SIZE_128;
	case AES_192: /* AES192 */
		return CH_CK_SIZE_192;
	case AES_256: /* AES256 */
		return CH_CK_SIZE_256;
	default:
		return CH_CK_SIZE_256;
	}
}

/*
 * Map a MAC selector (SHA_*) to the value written into the key context's
 * MAC-key-size field.  SHA1 and unknown selectors map to the 160-bit size.
 */
static unsigned char
get_mac_key_size(unsigned int mk_size)
{
	switch (mk_size) {
	case SHA_NOP: /* NOP */
		return CH_MK_SIZE_128;
	case SHA_GHASH: /* GHASH */
	case SHA_512: /* SHA512 */
		return CH_MK_SIZE_512;
	case SHA_224: /* SHA2-224 */
		return CH_MK_SIZE_192;
	case SHA_256: /* SHA2-256*/
		return CH_MK_SIZE_256;
	case SHA_384: /* SHA384 */
		return CH_MK_SIZE_512;
	case SHA1: /* SHA1 */
	default:
		return CH_MK_SIZE_160;
	}
}

/*
 * Map a protocol version constant to the value written into the key
 * context; unrecognized versions map to TLS_VERSION_MAX.
 */
static unsigned int
get_proto_ver(int proto_ver)
{
	switch (proto_ver) {
	case TLS1_2_VERSION:
		return TLS_1_2_VERSION;
	case TLS1_1_VERSION:
		return TLS_1_1_VERSION;
	case DTLS1_2_VERSION:
		return DTLS_1_2_VERSION;
	default:
		return TLS_VERSION_MAX;
	}
}

/*
 * Fill in the offsets word (ivinsert_to_authinsrt) of an rx key context.
 * For GCM the rx-opad-present and cipher/auth sequence control bits that
 * prepare_rxkey_wr() set are cleared as well.
 */
static void
tls_rxkey_flit1(struct tls_keyctx *kwr, struct tls_key_context *kctx)
{

	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
		kwr->u.rxhdr.ivinsert_to_authinsrt =
		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(14ULL) |
			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(16ULL) |
			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(14ULL) |
			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
			V_TLS_KEYCTX_TX_WR_AUTHINSRT(16ULL));
		kwr->u.rxhdr.ivpresent_to_rxmk_size &=
			~(V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1));
		kwr->u.rxhdr.authmode_to_rxvalid &=
			~(V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1));
	} else {
		kwr->u.rxhdr.ivinsert_to_authinsrt =
		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(22ULL) |
			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(0ULL) |
			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(22ULL) |
			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
			V_TLS_KEYCTX_TX_WR_AUTHINSRT(0ULL));
	}
}

/* Rx key */
/*
 * Build the receive key context 'kwr' from 'kctx': header fields first,
 * then the key material.  For non-GCM modes the key written is the one
 * produced by t4_aes_getdeckey(), followed by the ipad/opad bytes; for GCM
 * the key is copied as-is and the salt is stored in the header.
 */
static void
prepare_rxkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
{
	unsigned int ck_size = kctx->cipher_secret_size;
	unsigned int mk_size = kctx->mac_secret_size;
	int proto_ver = kctx->proto_ver;

	kwr->u.rxhdr.flitcnt_hmacctrl =
		((kctx->rx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;

	kwr->u.rxhdr.protover_ciphmode =
		V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) |
		V_TLS_KEYCTX_TX_WR_CIPHMODE(kctx->state.enc_mode);

	kwr->u.rxhdr.authmode_to_rxvalid =
		V_TLS_KEYCTX_TX_WR_AUTHMODE(kctx->state.auth_mode) |
		V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1) |
		V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(3) |
		V_TLS_KEYCTX_TX_WR_RXVALID(1);

	kwr->u.rxhdr.ivpresent_to_rxmk_size =
		V_TLS_KEYCTX_TX_WR_IVPRESENT(0) |
		V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1) |
		V_TLS_KEYCTX_TX_WR_RXCK_SIZE(get_cipher_key_size(ck_size)) |
		V_TLS_KEYCTX_TX_WR_RXMK_SIZE(get_mac_key_size(mk_size));

	tls_rxkey_flit1(kwr, kctx);

	/* No key reversal for GCM */
	if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) {
		t4_aes_getdeckey(kwr->keys.edkey, kctx->rx.key,
				 (kctx->cipher_secret_size << 3));
		memcpy(kwr->keys.edkey + kctx->cipher_secret_size,
		       kctx->rx.key + kctx->cipher_secret_size,
		       (IPAD_SIZE + OPAD_SIZE));
	} else {
		memcpy(kwr->keys.edkey, kctx->rx.key,
		       (kctx->rx_key_info_size - SALT_SIZE));
		memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE);
	}
}

/* Tx key */
/*
 * Build the transmit key context 'kwr' from 'kctx'.  For GCM the salt is
 * copied into the header and the tx-opad-present bit is cleared.  The flags
 * word is assembled in host order and byte-swapped as the last step.
 */
static void
prepare_txkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
{
	unsigned int ck_size = kctx->cipher_secret_size;
	unsigned int mk_size = kctx->mac_secret_size;

	kwr->u.txhdr.ctxlen =
		(kctx->tx_key_info_size >> 4);
	kwr->u.txhdr.dualck_to_txvalid =
		V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1) |
		V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
		V_TLS_KEYCTX_TX_WR_TXCK_SIZE(get_cipher_key_size(ck_size)) |
		V_TLS_KEYCTX_TX_WR_TXMK_SIZE(get_mac_key_size(mk_size)) |
		V_TLS_KEYCTX_TX_WR_TXVALID(1);

	memcpy(kwr->keys.edkey, kctx->tx.key, HDR_KCTX_SIZE);
	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
		memcpy(kwr->u.txhdr.txsalt, kctx->tx.salt, SALT_SIZE);
		kwr->u.txhdr.dualck_to_txvalid &=
			~(V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1));
	}
	kwr->u.txhdr.dualck_to_txvalid = htons(kwr->u.txhdr.dualck_to_txvalid);
}

/* TLS Key memory management */
/*
 * Allocate TLS_KEY_CONTEXT_SZ bytes from the adapter's key map without
 * sleeping.  Returns the allocated address, or -1 on failure.
 */
static int
get_new_keyid(struct toepcb *toep)
{
	struct adapter *sc = td_adapter(toep->td);
	vmem_addr_t addr;

	if (vmem_alloc(sc->key_map, TLS_KEY_CONTEXT_SZ, M_NOWAIT | M_FIRSTFIT,
	    &addr) != 0)
		return (-1);

	return (addr);
}

/* Return a key address obtained from get_new_keyid() to the key map. */
static void
free_keyid(struct toepcb *toep, int keyid)
{
	struct adapter *sc = td_adapter(toep->td);

	vmem_free(sc->key_map, keyid, TLS_KEY_CONTEXT_SZ);
}

/*
 * Release the connection's rx and tx key addresses, if assigned, and mark
 * both as unassigned (-1).
 */
static void
clear_tls_keyid(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	if (tls_ofld->rx_key_addr >= 0) {
		free_keyid(toep, tls_ofld->rx_key_addr);
		tls_ofld->rx_key_addr = -1;
	}
	if (tls_ofld->tx_key_addr >= 0) {
		free_keyid(toep, tls_ofld->tx_key_addr);
		tls_ofld->tx_key_addr = -1;
	}
}

/*
 * Pick the key address that 'ops' refers to: the rx address for
 * KEY_WRITE_RX (checked first), the tx address for KEY_WRITE_TX, and -1
 * when neither bit is set.
 */
static int
get_keyid(struct tls_ofld_info *tls_ofld, unsigned int ops)
{
	return (ops & KEY_WRITE_RX ? tls_ofld->rx_key_addr :
		((ops & KEY_WRITE_TX) ? tls_ofld->tx_key_addr : -1));
}

static int
get_tp_plen_max(struct tls_ofld_info *tls_ofld)
{
	int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448;

	return (tls_ofld->k_ctx.frag_size <= 8192 ?
plen : FC_TP_PLEN_MAX); } /* Send request to get the key-id */ static int tls_program_key_id(struct toepcb *toep, struct tls_key_context *k_ctx) { struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); struct ofld_tx_sdesc *txsd; int kwrlen, kctxlen, keyid, len; struct wrqe *wr; struct tls_key_req *kwr; struct tls_keyctx *kctx; kwrlen = sizeof(*kwr); kctxlen = roundup2(sizeof(*kctx), 32); len = roundup2(kwrlen + kctxlen, 16); if (toep->txsd_avail == 0) return (EAGAIN); /* Dont initialize key for re-neg */ if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) { if ((keyid = get_new_keyid(toep)) < 0) { return (ENOSPC); } } else { keyid = get_keyid(tls_ofld, k_ctx->l_p_key); } wr = alloc_wrqe(len, toep->ofld_txq); if (wr == NULL) { free_keyid(toep, keyid); return (ENOMEM); } kwr = wrtod(wr); memset(kwr, 0, kwrlen); kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_COMPL | F_FW_WR_ATOMIC); kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)) | V_FW_WR_FLOWID(toep->tid)); kwr->protocol = get_proto_ver(k_ctx->proto_ver); kwr->mfs = htons(k_ctx->frag_size); kwr->reneg_to_write_rx = k_ctx->l_p_key; /* master command */ kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) | V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1)); kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5)); kwr->len16 = htobe32((toep->tid << 8) | DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16)); kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5)); /* sub command */ kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); kwr->sc_len = htobe32(kctxlen); kctx = (struct tls_keyctx *)(kwr + 1); memset(kctx, 0, kctxlen); if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) { tls_ofld->tx_key_addr = keyid; prepare_txkey_wr(kctx, k_ctx); } else if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { tls_ofld->rx_key_addr = keyid; prepare_rxkey_wr(kctx, k_ctx); } txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = DIV_ROUND_UP(len, 16); txsd->plen = 0; toep->tx_credits -= 
txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); return (0); } /* Store a key received from SSL in DDR. */ static int program_key_context(struct tcpcb *tp, struct toepcb *toep, struct tls_key_context *uk_ctx) { struct adapter *sc = td_adapter(toep->td); struct tls_ofld_info *tls_ofld = &toep->tls; struct tls_key_context *k_ctx; int error, key_offset; if (tp->t_state != TCPS_ESTABLISHED) { /* * XXX: Matches Linux driver, but not sure this is a * very appropriate error. */ return (ENOENT); } /* Stop timer on handshake completion */ tls_stop_handshake_timer(toep); toep->flags &= ~TPF_FORCE_CREDITS; CTR4(KTR_CXGBE, "%s: tid %d %s proto_ver %#x", __func__, toep->tid, G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX ? "KEY_WRITE_RX" : "KEY_WRITE_TX", uk_ctx->proto_ver); if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX && ulp_mode(toep) != ULP_MODE_TLS) return (EOPNOTSUPP); /* Don't copy the 'tx' and 'rx' fields. 
*/ k_ctx = &tls_ofld->k_ctx; memcpy(&k_ctx->l_p_key, &uk_ctx->l_p_key, sizeof(*k_ctx) - offsetof(struct tls_key_context, l_p_key)); /* TLS version != 1.1 and !1.2 OR DTLS != 1.2 */ if (get_proto_ver(k_ctx->proto_ver) > DTLS_1_2_VERSION) { if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { tls_ofld->rx_key_addr = -1; t4_clear_rx_quiesce(toep); } else { tls_ofld->tx_key_addr = -1; } return (0); } if (k_ctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) { k_ctx->iv_size = 4; k_ctx->mac_first = 0; k_ctx->hmac_ctrl = 0; } else { k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */ k_ctx->mac_first = 1; } tls_ofld->scmd0.seqno_numivs = (V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) | V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) | V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) | V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) | V_SCMD_IV_SIZE(k_ctx->iv_size)); tls_ofld->scmd0.ivgen_hdrlen = (V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) | V_SCMD_KEY_CTX_INLINE(0) | V_SCMD_TLS_FRAG_ENABLE(1)); tls_ofld->mac_length = k_ctx->mac_secret_size; if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { k_ctx->rx = uk_ctx->rx; /* Dont initialize key for re-neg */ if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) tls_ofld->rx_key_addr = -1; } else { k_ctx->tx = uk_ctx->tx; /* Dont initialize key for re-neg */ if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) tls_ofld->tx_key_addr = -1; } /* Flush pending data before new Tx key becomes active */ if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) { struct sockbuf *sb; /* XXX: This might not drain everything. */ t4_push_frames(sc, toep, 0); sb = &toep->inp->inp_socket->so_snd; SOCKBUF_LOCK(sb); /* XXX: This asserts that everything has been pushed. 
*/ MPASS(sb->sb_sndptr == NULL || sb->sb_sndptr->m_next == NULL); sb->sb_sndptr = NULL; tls_ofld->sb_off = sbavail(sb); SOCKBUF_UNLOCK(sb); tls_ofld->tx_seq_no = 0; } if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) || (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) { /* * XXX: The userland library sets tx_key_info_size, not * rx_key_info_size. */ k_ctx->rx_key_info_size = k_ctx->tx_key_info_size; error = tls_program_key_id(toep, k_ctx); if (error) { /* XXX: Only clear quiesce for KEY_WRITE_RX? */ t4_clear_rx_quiesce(toep); return (error); } } if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { /* * RX key tags are an index into the key portion of MA * memory stored as an offset from the base address in * units of 64 bytes. */ key_offset = tls_ofld->rx_key_addr - sc->vres.key.start; t4_set_tls_keyid(toep, key_offset / 64); t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW), V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) | V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1)))); t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ, V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), V_TCB_TLS_SEQ(0)); t4_clear_rx_quiesce(toep); } else { unsigned short pdus_per_ulp; if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) tls_ofld->tx_key_addr = 1; tls_ofld->fcplenmax = get_tp_plen_max(tls_ofld); tls_ofld->expn_per_ulp = tls_expansion_size(toep, tls_ofld->fcplenmax, 1, &pdus_per_ulp); tls_ofld->pdus_per_ulp = pdus_per_ulp; tls_ofld->adjusted_plen = tls_ofld->pdus_per_ulp * ((tls_ofld->expn_per_ulp/tls_ofld->pdus_per_ulp) + tls_ofld->k_ctx.frag_size); } return (0); } /* * In some cases a client connection can hang without sending the * ServerHelloDone message from the NIC to the host. Send a dummy * RX_DATA_ACK with RX_MODULATE to unstick the connection. 
*/
static void
tls_send_handshake_ack(void *arg)
{
	struct toepcb *toep = arg;
	struct tls_ofld_info *tls_ofld = &toep->tls;
	struct adapter *sc = td_adapter(toep->td);

	/*
	 * XXX: Does not have the t4_get_tcb() checks to refine the
	 * workaround.
	 */
	/* Re-arm the timer so this keeps firing until it is stopped. */
	callout_schedule(&tls_ofld->handshake_timer, TLS_SRV_HELLO_RD_TM * hz);

	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
	send_rx_modulate(sc, toep);
}

/*
 * Arm the handshake timer of 'toep' to call tls_send_handshake_ack()
 * after TLS_SRV_HELLO_BKOFF_TM seconds.  tls_handshake_lock is held
 * around the callout update.
 */
static void
tls_start_handshake_timer(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	mtx_lock(&tls_handshake_lock);
	callout_reset(&tls_ofld->handshake_timer, TLS_SRV_HELLO_BKOFF_TM * hz,
	    tls_send_handshake_ack, toep);
	mtx_unlock(&tls_handshake_lock);
}

/* Stop the handshake timer of 'toep' while holding tls_handshake_lock. */
void
tls_stop_handshake_timer(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	mtx_lock(&tls_handshake_lock);
	callout_stop(&tls_ofld->handshake_timer);
	mtx_unlock(&tls_handshake_lock);
}

/*
 * Socket option handler for the TCP_TLSOM_* options.
 *
 * SOPT_SET: TCP_TLSOM_SET_TLS_CONTEXT programs a key context copied in
 * from the caller, TCP_TLSOM_CLR_TLS_TOM and TCP_TLSOM_CLR_QUIES call
 * tls_clr_ofld_mode() and tls_clr_quiesce() respectively.  All three
 * fail with EINVAL once the connection is in TLS_MODE_KTLS.
 * SOPT_GET: TCP_TLSOM_GET_TLS_TOM reports TLS_TOM_NONE, TLS_TOM_TXONLY
 * or TLS_TOM_BOTH.
 *
 * Returns 0 or an errno value; ECONNRESET if the inpcb is in timewait
 * or dropped, EOPNOTSUPP for unknown option names.  The inpcb write
 * lock is taken here and released on every path before returning.
 */
int
t4_ctloutput_tls(struct socket *so, struct sockopt *sopt)
{
	struct tls_key_context uk_ctx;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct toepcb *toep;
	int error, optval;

	error = 0;
	/* The key context is copied in before the inpcb lock is taken. */
	if (sopt->sopt_dir == SOPT_SET &&
	    sopt->sopt_name == TCP_TLSOM_SET_TLS_CONTEXT) {
		error = sooptcopyin(sopt, &uk_ctx, sizeof(uk_ctx),
		    sizeof(uk_ctx));
		if (error)
			return (error);
	}

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	toep = tp->t_toe;
	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_TLSOM_SET_TLS_CONTEXT:
			if (toep->tls.mode == TLS_MODE_KTLS)
				error = EINVAL;
			else {
				error = program_key_context(tp, toep, &uk_ctx);
				if (error == 0)
					toep->tls.mode = TLS_MODE_TLSOM;
			}
			INP_WUNLOCK(inp);
			break;
		case TCP_TLSOM_CLR_TLS_TOM:
			if (toep->tls.mode == TLS_MODE_KTLS)
				error = EINVAL;
			else if (ulp_mode(toep) == ULP_MODE_TLS) {
				CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM",
				    __func__, toep->tid);
				tls_clr_ofld_mode(toep);
			} else
				error = EOPNOTSUPP;
			INP_WUNLOCK(inp);
			break;
		case TCP_TLSOM_CLR_QUIES:
			if (toep->tls.mode == TLS_MODE_KTLS)
				error = EINVAL;
			else if (ulp_mode(toep) == ULP_MODE_TLS) {
				CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES",
				    __func__, toep->tid);
				tls_clr_quiesce(toep);
			} else
				error = EOPNOTSUPP;
			INP_WUNLOCK(inp);
			break;
		default:
			INP_WUNLOCK(inp);
			error = EOPNOTSUPP;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_TLSOM_GET_TLS_TOM:
			/*
			 * TLS TX is permitted on any TOE socket, but
			 * TLS RX requires a TLS ULP mode.
			 */
			optval = TLS_TOM_NONE;
			if (can_tls_offload(td_adapter(toep->td)) &&
			    toep->tls.mode != TLS_MODE_KTLS) {
				switch (ulp_mode(toep)) {
				case ULP_MODE_NONE:
				case ULP_MODE_TCPDDP:
					optval = TLS_TOM_TXONLY;
					break;
				case ULP_MODE_TLS:
					optval = TLS_TOM_BOTH;
					break;
				}
			}
			CTR3(KTR_CXGBE, "%s: tid %d GET_TLS_TOM = %d",
			    __func__, toep->tid, optval);
			/* Drop the lock before copying out to the caller. */
			INP_WUNLOCK(inp);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			INP_WUNLOCK(inp);
			error = EOPNOTSUPP;
			break;
		}
		break;
	}
	return (error);
}

#ifdef KERN_TLS
static void
init_ktls_key_context(struct ktls_session *tls, struct tls_key_context *k_ctx,
    int direction)
{
	struct auth_hash *axf;
	u_int key_info_size, mac_key_size;
	char *hash, *key;

	k_ctx->l_p_key = V_KEY_GET_LOC(direction == KTLS_TX ?
KEY_WRITE_TX : KEY_WRITE_RX); k_ctx->proto_ver = tls->params.tls_vmajor << 8 | tls->params.tls_vminor; k_ctx->cipher_secret_size = tls->params.cipher_key_len; key_info_size = sizeof(struct tx_keyctx_hdr) + k_ctx->cipher_secret_size; if (direction == KTLS_TX) key = k_ctx->tx.key; else key = k_ctx->rx.key; memcpy(key, tls->params.cipher_key, tls->params.cipher_key_len); hash = key + tls->params.cipher_key_len; if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) { k_ctx->state.auth_mode = SCMD_AUTH_MODE_GHASH; k_ctx->state.enc_mode = SCMD_CIPH_MODE_AES_GCM; k_ctx->iv_size = 4; k_ctx->mac_first = 0; k_ctx->hmac_ctrl = SCMD_HMAC_CTRL_NOP; key_info_size += GMAC_BLOCK_LEN; k_ctx->mac_secret_size = 0; if (direction == KTLS_TX) memcpy(k_ctx->tx.salt, tls->params.iv, SALT_SIZE); else memcpy(k_ctx->rx.salt, tls->params.iv, SALT_SIZE); t4_init_gmac_hash(tls->params.cipher_key, tls->params.cipher_key_len, hash); } else { switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: axf = &auth_hash_hmac_sha1; mac_key_size = SHA1_HASH_LEN; k_ctx->state.auth_mode = SCMD_AUTH_MODE_SHA1; break; case CRYPTO_SHA2_256_HMAC: axf = &auth_hash_hmac_sha2_256; mac_key_size = SHA2_256_HASH_LEN; k_ctx->state.auth_mode = SCMD_AUTH_MODE_SHA256; break; case CRYPTO_SHA2_384_HMAC: axf = &auth_hash_hmac_sha2_384; mac_key_size = SHA2_512_HASH_LEN; k_ctx->state.auth_mode = SCMD_AUTH_MODE_SHA512_384; break; default: panic("bad auth mode"); } k_ctx->state.enc_mode = SCMD_CIPH_MODE_AES_CBC; k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */ k_ctx->mac_first = 1; k_ctx->hmac_ctrl = SCMD_HMAC_CTRL_NO_TRUNC; key_info_size += roundup2(mac_key_size, 16) * 2; k_ctx->mac_secret_size = mac_key_size; t4_init_hmac_digest(axf, mac_key_size, tls->params.auth_key, tls->params.auth_key_len, hash); } if (direction == KTLS_TX) k_ctx->tx_key_info_size = key_info_size; else k_ctx->rx_key_info_size = key_info_size; k_ctx->frag_size = tls->params.max_frame_len; k_ctx->iv_ctrl = 1; } int tls_alloc_ktls(struct 
toepcb *toep, struct ktls_session *tls, int direction) { struct adapter *sc = td_adapter(toep->td); struct tls_key_context *k_ctx; int error, key_offset; if (toep->tls.mode == TLS_MODE_TLSOM) return (EINVAL); if (!can_tls_offload(td_adapter(toep->td))) return (EINVAL); switch (ulp_mode(toep)) { case ULP_MODE_TLS: break; case ULP_MODE_NONE: case ULP_MODE_TCPDDP: if (direction != KTLS_TX) return (EINVAL); break; default: return (EINVAL); } switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: /* XXX: Explicitly ignore any provided IV. */ switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: break; default: return (EPROTONOSUPPORT); } break; case CRYPTO_AES_NIST_GCM_16: if (tls->params.iv_len != SALT_SIZE) return (EINVAL); switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } break; default: return (EPROTONOSUPPORT); } /* Only TLS 1.1 and TLS 1.2 are currently supported. */ if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE || tls->params.tls_vminor < TLS_MINOR_VER_ONE || tls->params.tls_vminor > TLS_MINOR_VER_TWO) return (EPROTONOSUPPORT); /* Bail if we already have a key. */ if (direction == KTLS_TX) { if (toep->tls.tx_key_addr != -1) return (EOPNOTSUPP); } else { if (toep->tls.rx_key_addr != -1) return (EOPNOTSUPP); } /* * XXX: This assumes no key renegotation. If KTLS ever supports * that we will want to allocate TLS sessions dynamically rather * than as a static member of toep. 
*/ k_ctx = &toep->tls.k_ctx; init_ktls_key_context(tls, k_ctx, direction); error = tls_program_key_id(toep, k_ctx); if (error) return (error); if (direction == KTLS_TX) { toep->tls.scmd0.seqno_numivs = (V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) | V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) | V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) | V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) | V_SCMD_IV_SIZE(k_ctx->iv_size)); toep->tls.scmd0.ivgen_hdrlen = (V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) | V_SCMD_KEY_CTX_INLINE(0) | V_SCMD_TLS_FRAG_ENABLE(1)); if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) toep->tls.iv_len = 8; else toep->tls.iv_len = AES_BLOCK_LEN; toep->tls.mac_length = k_ctx->mac_secret_size; toep->tls.fcplenmax = get_tp_plen_max(&toep->tls); toep->tls.expn_per_ulp = tls->params.tls_hlen + tls->params.tls_tlen; toep->tls.pdus_per_ulp = 1; toep->tls.adjusted_plen = toep->tls.expn_per_ulp + toep->tls.k_ctx.frag_size; } else { /* Stop timer on handshake completion */ tls_stop_handshake_timer(toep); toep->flags &= ~TPF_FORCE_CREDITS; /* * RX key tags are an index into the key portion of MA * memory stored as an offset from the base address in * units of 64 bytes. 
*/
		key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
		t4_set_tls_keyid(toep, key_offset / 64);
		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
						V_TF_TLS_CONTROL(1) |
						V_TF_TLS_ACTIVE(1) |
						V_TF_TLS_ENABLE(1))));
		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
				 V_TCB_TLS_SEQ(0));
		t4_clear_rx_quiesce(toep);
	}

	toep->tls.mode = TLS_MODE_KTLS;

	return (0);
}
#endif

/*
 * Initialize the TLS offload state of a new toep: mode off, keys kept
 * in DDR, no RX/TX key address assigned.  The handshake timer callout
 * is only initialized for connections in ULP_MODE_TLS.
 */
void
tls_init_toep(struct toepcb *toep)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;

	tls_ofld->mode = TLS_MODE_OFF;
	tls_ofld->key_location = TLS_SFO_WR_CONTEXTLOC_DDR;
	tls_ofld->rx_key_addr = -1;
	tls_ofld->tx_key_addr = -1;
	if (ulp_mode(toep) == ULP_MODE_TLS)
		callout_init_mtx(&tls_ofld->handshake_timer,
		    &tls_handshake_lock, 0);
}

/*
 * Set TF_TLS_ENABLE in the connection's TCB, force credits on the toep
 * and start the handshake timer.
 */
void
tls_establish(struct toepcb *toep)
{

	/*
	 * Enable PDU extraction.
	 *
	 * XXX: Supposedly this should be done by the firmware when
	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
	 * in practice this seems to be required.
	 */
	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));

	toep->flags |= TPF_FORCE_CREDITS;

	tls_start_handshake_timer(toep);
}

/*
 * Tear down the TLS offload state of a toep: stop the handshake timer
 * (ULP_MODE_TLS only, matching tls_init_toep()) and release any key
 * addresses.
 */
void
tls_uninit_toep(struct toepcb *toep)
{

	if (ulp_mode(toep) == ULP_MODE_TLS)
		tls_stop_handshake_timer(toep);
	clear_tls_keyid(toep);
}

/* Largest work request size expressed in 16-byte credits. */
#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
/*
 * Credits needed for the smallest TLS TX work request: headers, key
 * reference, one cipher block and one more byte.
 */
#define	MIN_OFLD_TLSTX_CREDITS(toep)					\
	(howmany(sizeof(struct fw_tlstx_data_wr) +			\
	    sizeof(struct cpl_tx_tls_sfo) + key_size((toep)) +		\
	    CIPHER_BLOCK_SIZE + 1, 16))

/*
 * Return the number of bytes that may be used for an immediate-data
 * work request given 'tx_credits': 16 bytes per credit, capped at two
 * EQ_ESIZE-sized descriptors.
 */
static inline u_int
max_imm_tls_space(int tx_credits)
{
	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
	int space;

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		space = (n * EQ_ESIZE);
	else
		space = tx_credits * 16;
	return (space);
}

/*
 * Count the S/G segments needed to describe 'len' bytes of the mbuf
 * chain 'm' starting 'skip' bytes in.  The largest segment count of
 * any single mbuf is stored in *max_nsegs_1mbufp and the total is
 * returned.
 */
static int
count_mbuf_segs(struct mbuf *m, int skip, int len, int *max_nsegs_1mbufp)
{
	int max_nsegs_1mbuf, n, nsegs;

	/* Advance to the mbuf that contains the first byte of interest. */
	while (skip >= m->m_len) {
		skip -= m->m_len;
		m = m->m_next;
	}

	nsegs = 0;
	max_nsegs_1mbuf = 0;
	while (len > 0) {
		n = sglist_count(mtod(m, char *) + skip, m->m_len - skip);
		if (n > max_nsegs_1mbuf)
			max_nsegs_1mbuf = n;
		nsegs += n;
		len -= m->m_len - skip;
		/* Only the first mbuf visited is entered at an offset. */
		skip = 0;
		m = m->m_next;
	}
	*max_nsegs_1mbufp = max_nsegs_1mbuf;
	return (nsegs);
}

/*
 * Fill in the FW_TLSTX_DATA_WR header 'txwr' for one TLS record.
 * 'plen' and 'expn' are added together for the WR's plen field;
 * 'credits' is the length of the work request in 16-byte units and
 * 'imm_ivs' tells whether the IVs are carried as immediate data
 * rather than via the SGL.
 */
static void
write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
    unsigned int immdlen, unsigned int plen, unsigned int expn,
    unsigned int pdus, uint8_t credits, int shove, int imm_ivs)
{
	struct tls_ofld_info *tls_ofld = &toep->tls;
	unsigned int len = plen + expn;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) |
	    V_FW_TLSTX_DATA_WR_COMPL(1) |
	    V_FW_TLSTX_DATA_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) |
	    V_FW_TLSTX_DATA_WR_LEN16(credits));
	txwr->plen = htobe32(len);
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) |
	    V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove));
	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(pdus) |
	    V_FW_TLSTX_DATA_WR_EXP(expn) |
	    V_FW_TLSTX_DATA_WR_CTXLOC(tls_ofld->key_location) |
	    V_FW_TLSTX_DATA_WR_IVDSGL(!imm_ivs) |
	    V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->k_ctx.tx_key_info_size >> 4));
	txwr->mfs = htobe16(tls_ofld->k_ctx.frag_size);
	txwr->adjustedplen_pkd = htobe16(
	    V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen));
	txwr->expinplenmax_pkd = htobe16(
	    V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp));
	txwr->pdusinplenmax_pkd =
	    V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp);
}

static void
write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
struct tls_hdr *tls_hdr, unsigned int plen, unsigned int pdus) { struct tls_ofld_info *tls_ofld = &toep->tls; int data_type, seglen; if (plen < tls_ofld->k_ctx.frag_size) seglen = plen; else seglen = tls_ofld->k_ctx.frag_size; data_type = tls_content_type(tls_hdr->type); cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) | V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) | V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen)); cpl->pld_len = htobe32(plen); if (data_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT) cpl->type_protover = htobe32( V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type)); cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs | V_SCMD_NUM_IVS(pdus)); cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen); cpl->scmd1 = htobe64(tls_ofld->tx_seq_no); tls_ofld->tx_seq_no += pdus; } /* * Similar to write_tx_sgl() except that it accepts an optional * trailer buffer for IVs. */ static void write_tlstx_sgl(void *dst, struct mbuf *start, int skip, int plen, void *iv_buffer, int iv_len, int nsegs, int n) { struct mbuf *m; struct ulptx_sgl *usgl = dst; int i, j, rc; struct sglist sg; struct sglist_seg segs[n]; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); sglist_init(&sg, n, segs); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); for (m = start; skip >= m->m_len; m = m->m_next) skip -= m->m_len; i = -1; for (m = start; plen > 0; m = m->m_next) { rc = sglist_append(&sg, mtod(m, char *) + skip, m->m_len - skip); if (__predict_false(rc != 0)) panic("%s: sglist_append %d", __func__, rc); plen -= m->m_len - skip; skip = 0; for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } sglist_reset(&sg); } if (iv_buffer != NULL) { rc = sglist_append(&sg, iv_buffer, iv_len); if (__predict_false(rc != 0)) 
panic("%s: sglist_append %d", __func__, rc); for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, iv_buffer %p", __func__, nsegs, start, iv_buffer)); } /* * Similar to t4_push_frames() but handles TLS sockets when TLS offload * is enabled. Rather than transmitting bulk data, the socket buffer * contains TLS records. The work request requires a full TLS record, * so batch mbufs up until a full TLS record is seen. This requires * reading the TLS header out of the start of each record to determine * its length. */ void t4_push_tls_records(struct adapter *sc, struct toepcb *toep, int drop) { struct tls_hdr thdr; struct mbuf *sndptr; struct fw_tlstx_data_wr *txwr; struct cpl_tx_tls_sfo *cpl; struct wrqe *wr; u_int plen, nsegs, credits, space, max_nsegs_1mbuf, wr_len; u_int expn_size, iv_len, pdus, sndptroff; struct tls_ofld_info *tls_ofld = &toep->tls; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int tls_size, tx_credits, shove, /* compl,*/ sowwakeup; struct ofld_tx_sdesc *txsd; bool imm_ivs, imm_payload; void *iv_buffer, *iv_dst, *buf; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); KASSERT(tls_tx_key(toep), ("%s: TX key not set for toep %p", __func__, toep)); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags); 
#endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; for (;;) { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); space = max_imm_tls_space(tx_credits); wr_len = sizeof(struct fw_tlstx_data_wr) + sizeof(struct cpl_tx_tls_sfo) + key_size(toep); if (wr_len + CIPHER_BLOCK_SIZE + 1 > space) { #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d tx_credits %d min_wr %d space %d", __func__, toep->tid, tx_credits, wr_len + CIPHER_BLOCK_SIZE + 1, space); #endif return; } SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); MPASS(tls_ofld->sb_off >= drop); tls_ofld->sb_off -= drop; drop = 0; } /* * Send a FIN if requested, but only if there's no * more data to send. */ if (sbavail(sb) == tls_ofld->sb_off && toep->flags & TPF_SEND_FIN) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); t4_close_conn(sc, toep); return; } if (sbavail(sb) < tls_ofld->sb_off + TLS_HEADER_LENGTH) { /* * A full TLS header is not yet queued, stop * for now until more data is added to the * socket buffer. However, if the connection * has been closed, we will never get the rest * of the header so just discard the partial * header and close the connection. */ #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d%s", __func__, toep->tid, sbavail(sb), tls_ofld->sb_off, toep->flags & TPF_SEND_FIN ? 
"" : " SEND_FIN"); #endif if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); return; } /* Read the header of the next TLS record. */ sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff); m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr); tls_size = htons(thdr.length); plen = TLS_HEADER_LENGTH + tls_size; pdus = howmany(tls_size, tls_ofld->k_ctx.frag_size); iv_len = pdus * CIPHER_BLOCK_SIZE; if (sbavail(sb) < tls_ofld->sb_off + plen) { /* * The full TLS record is not yet queued, stop * for now until more data is added to the * socket buffer. However, if the connection * has been closed, we will never get the rest * of the record so just discard the partial * record and close the connection. */ #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d plen %d%s", __func__, toep->tid, sbavail(sb), tls_ofld->sb_off, plen, toep->flags & TPF_SEND_FIN ? "" : " SEND_FIN"); #endif if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); return; } /* Shove if there is no additional data pending. */ shove = (sbavail(sb) == tls_ofld->sb_off + plen) && !(tp->t_flags & TF_MORETOCOME); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); /* Determine whether to use immediate vs SGL. 
*/ imm_payload = false; imm_ivs = false; if (wr_len + iv_len <= space) { imm_ivs = true; wr_len += iv_len; if (wr_len + tls_size <= space) { wr_len += tls_size; imm_payload = true; } } /* Allocate space for IVs if needed. */ if (!imm_ivs) { iv_buffer = malloc(iv_len, M_CXGBE, M_NOWAIT); if (iv_buffer == NULL) { /* * XXX: How to restart this? */ if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); CTR3(KTR_CXGBE, "%s: tid %d failed to alloc IV space len %d", __func__, toep->tid, iv_len); return; } } else iv_buffer = NULL; /* Determine size of SGL. */ nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ if (!imm_payload) { nsegs = count_mbuf_segs(sndptr, sndptroff + TLS_HEADER_LENGTH, tls_size, &max_nsegs_1mbuf); if (!imm_ivs) { int n = sglist_count(iv_buffer, iv_len); nsegs += n; if (n > max_nsegs_1mbuf) max_nsegs_1mbuf = n; } /* Account for SGL in work request length. */ wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; } wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d TLS record %d len %#x pdus %d", __func__, toep->tid, thdr.type, tls_size, pdus); #endif txwr = wrtod(wr); cpl = (struct cpl_tx_tls_sfo *)(txwr + 1); memset(txwr, 0, roundup2(wr_len, 16)); credits = howmany(wr_len, 16); expn_size = tls_expansion_size(toep, tls_size, 0, NULL); write_tlstx_wr(txwr, toep, imm_payload ? tls_size : 0, tls_size, expn_size, pdus, credits, shove, imm_ivs ? 
1 : 0); write_tlstx_cpl(cpl, toep, &thdr, tls_size, pdus); tls_copy_tx_key(toep, cpl + 1); /* Generate random IVs */ buf = (char *)(cpl + 1) + key_size(toep); if (imm_ivs) { MPASS(iv_buffer == NULL); iv_dst = buf; buf = (char *)iv_dst + iv_len; } else iv_dst = iv_buffer; arc4rand(iv_dst, iv_len, 0); if (imm_payload) { m_copydata(sndptr, sndptroff + TLS_HEADER_LENGTH, tls_size, buf); } else { write_tlstx_sgl(buf, sndptr, sndptroff + TLS_HEADER_LENGTH, tls_size, iv_buffer, iv_len, nsegs, max_nsegs_1mbuf); } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; tp->snd_nxt += plen; tp->snd_max += plen; SOCKBUF_LOCK(sb); sbsndptr_adv(sb, sb->sb_sndptr, plen); tls_ofld->sb_off += plen; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep)) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); txsd->plen = plen; txsd->tx_credits = credits; txsd->iv_buffer = iv_buffer; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; atomic_add_long(&toep->vi->pi->tx_toe_tls_records, 1); atomic_add_long(&toep->vi->pi->tx_toe_tls_octets, plen); t4_l2t_send(sc, wr, toep->l2te); } } #ifdef KERN_TLS static int count_ext_pgs_segs(struct mbuf *m) { vm_paddr_t nextpa; u_int i, nsegs; MPASS(m->m_epg_npgs > 0); nsegs = 1; nextpa = m->m_epg_pa[0] + PAGE_SIZE; for (i = 1; i < m->m_epg_npgs; i++) { if (nextpa != m->m_epg_pa[i]) nsegs++; nextpa = m->m_epg_pa[i] + PAGE_SIZE; } return (nsegs); } static void write_ktlstx_sgl(void *dst, struct mbuf *m, int nsegs) { struct ulptx_sgl *usgl = dst; vm_paddr_t pa; uint32_t len; int i, j; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); /* Figure out the first S/G length. 
*/ pa = m->m_epg_pa[0] + m->m_epg_1st_off; usgl->addr0 = htobe64(pa); len = m_epg_pagelen(m, 0, m->m_epg_1st_off); pa += len; for (i = 1; i < m->m_epg_npgs; i++) { if (m->m_epg_pa[i] != pa) break; len += m_epg_pagelen(m, i, 0); pa += m_epg_pagelen(m, i, 0); } usgl->len0 = htobe32(len); #ifdef INVARIANTS nsegs--; #endif j = -1; for (; i < m->m_epg_npgs; i++) { if (j == -1 || m->m_epg_pa[i] != pa) { if (j >= 0) usgl->sge[j / 2].len[j & 1] = htobe32(len); j++; #ifdef INVARIANTS nsegs--; #endif pa = m->m_epg_pa[i]; usgl->sge[j / 2].addr[j & 1] = htobe64(pa); len = m_epg_pagelen(m, i, 0); pa += len; } else { len += m_epg_pagelen(m, i, 0); pa += m_epg_pagelen(m, i, 0); } } if (j >= 0) { usgl->sge[j / 2].len[j & 1] = htobe32(len); if ((j & 1) == 0) usgl->sge[j / 2].len[1] = htobe32(0); } KASSERT(nsegs == 0, ("%s: nsegs %d, m %p", __func__, nsegs, m)); } /* * Similar to t4_push_frames() but handles sockets that contain TLS * record mbufs. Unlike TLSOM, each mbuf is a complete TLS record and * corresponds to a single work request. 
*/ void t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop) { struct tls_hdr *thdr; struct fw_tlstx_data_wr *txwr; struct cpl_tx_tls_sfo *cpl; struct wrqe *wr; struct mbuf *m; u_int nsegs, credits, wr_len; u_int expn_size; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int tls_size, tx_credits, shove, sowwakeup; struct ofld_tx_sdesc *txsd; char *buf; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); KASSERT(tls_tx_key(toep), ("%s: TX key not set for toep %p", __func__, toep)); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags); #endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; for (;;) { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); drop = 0; } m = sb->sb_sndptr != NULL ? sb->sb_sndptr->m_next : sb->sb_mb; /* * Send a FIN if requested, but only if there's no * more data to send. 
*/ if (m == NULL && toep->flags & TPF_SEND_FIN) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); t4_close_conn(sc, toep); return; } /* * If there is no ready data to send, wait until more * data arrives. */ if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d no ready data to send", __func__, toep->tid); #endif return; } KASSERT(m->m_flags & M_EXTPG, ("%s: mbuf %p is not NOMAP", __func__, m)); KASSERT(m->m_epg_tls != NULL, ("%s: mbuf %p doesn't have TLS session", __func__, m)); /* Calculate WR length. */ wr_len = sizeof(struct fw_tlstx_data_wr) + sizeof(struct cpl_tx_tls_sfo) + key_size(toep); /* Explicit IVs for AES-CBC and AES-GCM are <= 16. */ MPASS(toep->tls.iv_len <= AES_BLOCK_LEN); wr_len += AES_BLOCK_LEN; /* Account for SGL in work request length. */ nsegs = count_ext_pgs_segs(m); wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; /* Not enough credits for this work request. */ if (howmany(wr_len, 16) > tx_credits) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d mbuf %p requires %d credits, but only %d available", __func__, toep->tid, m, howmany(wr_len, 16), tx_credits); #endif toep->flags |= TPF_TX_SUSPENDED; return; } /* Shove if there is no additional data pending. 
*/ shove = ((m->m_next == NULL || (m->m_next->m_flags & M_NOTAVAIL) != 0)) && (tp->t_flags & TF_MORETOCOME) == 0; if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } thdr = (struct tls_hdr *)&m->m_epg_hdr; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d TLS record %ju type %d len %#x", __func__, toep->tid, m->m_epg_seqno, thdr->type, m->m_len); #endif txwr = wrtod(wr); cpl = (struct cpl_tx_tls_sfo *)(txwr + 1); memset(txwr, 0, roundup2(wr_len, 16)); credits = howmany(wr_len, 16); expn_size = m->m_epg_hdrlen + m->m_epg_trllen; tls_size = m->m_len - expn_size; write_tlstx_wr(txwr, toep, 0, tls_size, expn_size, 1, credits, shove, 1); toep->tls.tx_seq_no = m->m_epg_seqno; write_tlstx_cpl(cpl, toep, thdr, tls_size, 1); tls_copy_tx_key(toep, cpl + 1); /* Copy IV. 
*/ buf = (char *)(cpl + 1) + key_size(toep); memcpy(buf, thdr + 1, toep->tls.iv_len); buf += AES_BLOCK_LEN; write_ktlstx_sgl(buf, m, nsegs); KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; tp->snd_nxt += m->m_len; tp->snd_max += m->m_len; SOCKBUF_LOCK(sb); sb->sb_sndptr = m; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep)) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); txsd->plen = m->m_len; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; atomic_add_long(&toep->vi->pi->tx_toe_tls_records, 1); atomic_add_long(&toep->vi->pi->tx_toe_tls_octets, m->m_len); t4_l2t_send(sc, wr, toep->l2te); } } #endif /* * For TLS data we place received mbufs received via CPL_TLS_DATA into * an mbufq in the TLS offload state. When CPL_RX_TLS_CMP is * received, the completed PDUs are placed into the socket receive * buffer. * * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs. */ static int do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_tls_data *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; int len; /* XXX: Should this match do_rx_data instead? 
*/ KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; atomic_add_long(&toep->vi->pi->rx_toe_tls_octets, len); KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } /* Save TCP sequence number. */ m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq); if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) { #ifdef INVARIANTS panic("Failed to queue TLS data packet"); #else printf("%s: Failed to queue TLS data packet\n", __func__); INP_WUNLOCK(inp); m_freem(m); return (0); #endif } tp = intotcpcb(inp); tp->t_rcvtime = ticks; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, be32toh(cpl->seq)); #endif INP_WUNLOCK(inp); return (0); } static int do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *); struct tlsrx_hdr_pkt *tls_hdr_pkt; unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct mbuf *tls_data; #ifdef KERN_TLS struct tls_get_record *tgr; struct mbuf *control; #endif int len, pdu_length, rx_credits; KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; atomic_add_long(&toep->vi->pi->rx_toe_tls_records, 1); KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)), ("%s: payload 
length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length)); so = inp_inpcbtosocket(inp); tp = intotcpcb(inp); #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u", __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt); #endif tp->rcv_nxt += pdu_length; - if (tp->rcv_wnd < pdu_length) { - toep->tls.rcv_over += pdu_length - tp->rcv_wnd; - tp->rcv_wnd = 0; - } else - tp->rcv_wnd -= pdu_length; + KASSERT(tp->rcv_wnd >= pdu_length, + ("%s: negative window size", __func__)); + tp->rcv_wnd -= pdu_length; /* XXX: Not sure what to do about urgent data. */ /* * The payload of this CPL is the TLS header followed by * additional fields. */ KASSERT(m->m_len >= sizeof(*tls_hdr_pkt), ("%s: payload too small", __func__)); tls_hdr_pkt = mtod(m, void *); tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq); if (tls_data != NULL) { KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq, ("%s: sequence mismatch", __func__)); } #ifdef KERN_TLS if (toep->tls.mode == TLS_MODE_KTLS) { /* Report decryption errors as EBADMSG. */ if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) { m_freem(m); m_freem(tls_data); CURVNET_SET(toep->vnet); so->so_error = EBADMSG; sorwakeup(so); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } /* Allocate the control message mbuf. 
*/ control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD, IPPROTO_TCP); if (control == NULL) { m_freem(m); m_freem(tls_data); CURVNET_SET(toep->vnet); so->so_error = ENOBUFS; sorwakeup(so); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } tgr = (struct tls_get_record *) CMSG_DATA(mtod(control, struct cmsghdr *)); tgr->tls_type = tls_hdr_pkt->type; tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8; tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff; m_freem(m); if (tls_data != NULL) { m_last(tls_data)->m_flags |= M_EOR; tgr->tls_length = htobe16(tls_data->m_pkthdr.len); } else tgr->tls_length = 0; m = tls_data; } else #endif { /* * Only the TLS header is sent to OpenSSL, so report * errors by altering the record type. */ if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) tls_hdr_pkt->type = CONTENT_TYPE_ERROR; /* Trim this CPL's mbuf to only include the TLS header. */ KASSERT(m->m_len == len && m->m_next == NULL, ("%s: CPL spans multiple mbufs", __func__)); m->m_len = TLS_HEADER_LENGTH; m->m_pkthdr.len = TLS_HEADER_LENGTH; if (tls_data != NULL) { /* * Update the TLS header length to be the length of * the payload data. */ tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len); m->m_next = tls_data; m->m_pkthdr.len += tls_data->m_len; } #ifdef KERN_TLS control = NULL; #endif } sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { struct epoch_tracker et; CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, pdu_length); m_freem(m); #ifdef KERN_TLS m_freem(control); #endif SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * Not all of the bytes on the wire are included in the socket buffer * (e.g. the MAC of the TLS record). However, those bytes are included * in the TCP sequence space. 
*/ /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } #ifdef KERN_TLS if (control != NULL) sbappendcontrol_locked(sb, m, control, 0); else #endif sbappendstream_locked(sb, m, 0); rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u", __func__, tid, rx_credits, tp->rcv_wnd); #endif if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { rx_credits = send_rx_credits(sc, toep, rx_credits); tp->rcv_wnd += rx_credits; tp->rcv_adv += rx_credits; } sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void t4_tls_mod_load(void) { mtx_init(&tls_handshake_lock, "t4tls handshake", NULL, MTX_DEF); t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data); t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp); } void t4_tls_mod_unload(void) { t4_register_cpl_handler(CPL_TLS_DATA, NULL); t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL); mtx_destroy(&tls_handshake_lock); } #endif /* TCP_OFFLOAD */ Index: head/sys/dev/cxgbe/tom/t4_tls.h =================================================================== --- head/sys/dev/cxgbe/tom/t4_tls.h (revision 366853) +++ head/sys/dev/cxgbe/tom/t4_tls.h (revision 366854) @@ -1,591 +1,590 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017-2018 Chelsio Communications, Inc. * All rights reserved. * Written by: John Baldwin , Atul Gupta * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_TLS_H__ #define __T4_TLS_H__ #define TLS1_VERSION 0x0301 #define TLS1_1_VERSION 0x0302 #define TLS1_2_VERSION 0x0303 #define TLS_MAX_VERSION TLS1_2_VERSION #define DTLS1_VERSION 0xFEFF #define DTLS1_2_VERSION 0xFEFD #define DTLS_MAX_VERSION DTLS1_2_VERSION #define DTLS1_VERSION_MAJOR 0xFE /* Custom socket options for TLS+TOE. 
*/ #define MAX_MAC_KSZ 64 /* 512 bits */ #define MAX_CIPHER_KSZ 32 /* 256 bits */ #define CIPHER_BLOCK_SZ 16 #define SALT_SIZE 4 /* Can accommodate 16, 11-15 are reserved */ enum { CHSSL_SHA_NOP, CHSSL_SHA1, CHSSL_SHA224, CHSSL_SHA256, CHSSL_GHASH, CHSSL_SHA512_224, CHSSL_SHA512_256, CHSSL_SHA512_384, CHSSL_SHA512_512, CHSSL_CBCMAC, CHSSL_CMAC, }; /* Can accommodate 16, 8-15 are reserved */ enum { CHSSL_CIPH_NOP, CHSSL_AES_CBC, CHSSL_AES_GCM, CHSSL_AES_CTR, CHSSL_AES_GEN, CHSSL_IPSEC_ESP, CHSSL_AES_XTS, CHSSL_AES_CCM, }; /* Key Context Programming Operation type */ #define KEY_WRITE_RX 0x1 #define KEY_WRITE_TX 0x2 #define KEY_DELETE_RX 0x4 #define KEY_DELETE_TX 0x8 #define S_KEY_CLR_LOC 4 #define M_KEY_CLR_LOC 0xf #define V_KEY_CLR_LOC(x) ((x) << S_KEY_CLR_LOC) #define G_KEY_CLR_LOC(x) (((x) >> S_KEY_CLR_LOC) & M_KEY_CLR_LOC) #define F_KEY_CLR_LOC V_KEY_CLR_LOC(1U) #define S_KEY_GET_LOC 0 #define M_KEY_GET_LOC 0xf #define V_KEY_GET_LOC(x) ((x) << S_KEY_GET_LOC) #define G_KEY_GET_LOC(x) (((x) >> S_KEY_GET_LOC) & M_KEY_GET_LOC) struct tls_ofld_state { unsigned char enc_mode; unsigned char mac_mode; unsigned char key_loc; unsigned char ofld_mode; unsigned char auth_mode; unsigned char resv[3]; }; struct tls_tx_ctxt { unsigned char salt[SALT_SIZE]; unsigned char key[MAX_CIPHER_KSZ]; unsigned char ipad[MAX_MAC_KSZ]; unsigned char opad[MAX_MAC_KSZ]; }; struct tls_rx_ctxt { unsigned char salt[SALT_SIZE]; unsigned char key[MAX_CIPHER_KSZ]; unsigned char ipad[MAX_MAC_KSZ]; unsigned char opad[MAX_MAC_KSZ]; }; struct tls_key_context { struct tls_tx_ctxt tx; struct tls_rx_ctxt rx; unsigned char l_p_key; unsigned char hmac_ctrl; unsigned char mac_first; unsigned char iv_size; unsigned char iv_ctrl; unsigned char iv_algo; unsigned char tx_seq_no; unsigned char rx_seq_no; struct tls_ofld_state state; unsigned int tx_key_info_size; unsigned int rx_key_info_size; unsigned int frag_size; unsigned int mac_secret_size; unsigned int cipher_secret_size; int proto_ver; unsigned int sock_fd; 
unsigned short dtls_epoch; unsigned short rsv; }; /* Set with 'struct tls_key_context'. */ #define TCP_TLSOM_SET_TLS_CONTEXT (TCP_VENDOR) /* Get returns int of enabled (1) / disabled (0). */ #define TCP_TLSOM_GET_TLS_TOM (TCP_VENDOR + 1) enum { TLS_TOM_NONE = 0, TLS_TOM_TXONLY, TLS_TOM_BOTH }; /* Set with no value. */ #define TCP_TLSOM_CLR_TLS_TOM (TCP_VENDOR + 2) /* Set with no value. */ #define TCP_TLSOM_CLR_QUIES (TCP_VENDOR + 3) #ifdef _KERNEL /* Timeouts for handshake timer in seconds. */ #define TLS_SRV_HELLO_DONE 9 #define TLS_SRV_HELLO_RD_TM 5 #define TLS_SRV_HELLO_BKOFF_TM 15 #define CONTENT_TYPE_CCS 20 #define CONTENT_TYPE_ALERT 21 #define CONTENT_TYPE_HANDSHAKE 22 #define CONTENT_TYPE_APP_DATA 23 #define CONTENT_TYPE_HEARTBEAT 24 #define CONTENT_TYPE_KEY_CONTEXT 32 #define CONTENT_TYPE_ERROR 127 #define GCM_TAG_SIZE 16 #define AEAD_EXPLICIT_DATA_SIZE 8 #define TLS_HEADER_LENGTH 5 #define TP_TX_PG_SZ 65536 #define FC_TP_PLEN_MAX 17408 #define IPAD_SIZE 64 #define OPAD_SIZE 64 #define KEY_SIZE 32 #define CIPHER_BLOCK_SIZE 16 #define HDR_KCTX_SIZE (IPAD_SIZE + OPAD_SIZE + KEY_SIZE) #define KEY_IN_DDR_SIZE 16 #define TLS_KEY_CONTEXT_SZ roundup2(sizeof(struct tls_tx_ctxt), 32) /* MAC KEY SIZE */ #define SHA_NOP 0 #define SHA_GHASH 16 #define SHA_224 28 #define SHA_256 32 #define SHA_384 48 #define SHA_512 64 #define SHA1 20 /* CIPHER KEY SIZE */ #define AES_NOP 0 #define AES_128 16 #define AES_192 24 #define AES_256 32 enum { TLS_1_2_VERSION, TLS_1_1_VERSION, DTLS_1_2_VERSION, TLS_VERSION_MAX, }; enum { CH_EVP_CIPH_STREAM_CIPHER, CH_EVP_CIPH_CBC_MODE, CH_EVP_CIPH_GCM_MODE, CH_EVP_CIPH_CTR_MODE, }; enum { TLS_SFO_WR_CONTEXTLOC_DSGL, TLS_SFO_WR_CONTEXTLOC_IMMEDIATE, TLS_SFO_WR_CONTEXTLOC_DDR, }; enum { CPL_TX_TLS_SFO_TYPE_CCS, CPL_TX_TLS_SFO_TYPE_ALERT, CPL_TX_TLS_SFO_TYPE_HANDSHAKE, CPL_TX_TLS_SFO_TYPE_DATA, CPL_TX_TLS_SFO_TYPE_HEARTBEAT, /* XXX: Shouldn't this be "CUSTOM"? 
*/ }; enum { CH_CK_SIZE_128, CH_CK_SIZE_192, CH_CK_SIZE_256, CH_CK_SIZE_NOP, }; enum { CH_MK_SIZE_128, CH_MK_SIZE_160, CH_MK_SIZE_192, CH_MK_SIZE_256, CH_MK_SIZE_512, CH_MK_SIZE_NOP, }; struct tls_scmd { __be32 seqno_numivs; __be32 ivgen_hdrlen; }; enum tls_mode { TLS_MODE_OFF, TLS_MODE_TLSOM, TLS_MODE_KTLS, }; struct tls_ofld_info { struct tls_key_context k_ctx; int key_location; int mac_length; int rx_key_addr; int tx_key_addr; uint64_t tx_seq_no; unsigned short fcplenmax; unsigned short adjusted_plen; unsigned short expn_per_ulp; unsigned short pdus_per_ulp; struct tls_scmd scmd0; u_int iv_len; enum tls_mode mode; struct callout handshake_timer; u_int sb_off; - u_int rcv_over; }; struct tls_key_req { __be32 wr_hi; __be32 wr_mid; __be32 ftid; __u8 reneg_to_write_rx; __u8 protocol; __be16 mfs; /* master command */ __be32 cmd; __be32 len16; /* command length */ __be32 dlen; /* data length in 32-byte units */ __be32 kaddr; /* sub-command */ __be32 sc_more; __be32 sc_len; }__packed; struct tls_keyctx { union key_ctx { struct tx_keyctx_hdr { __u8 ctxlen; __u8 r2; __be16 dualck_to_txvalid; __u8 txsalt[4]; __be64 r5; } txhdr; struct rx_keyctx_hdr { __u8 flitcnt_hmacctrl; __u8 protover_ciphmode; __u8 authmode_to_rxvalid; __u8 ivpresent_to_rxmk_size; __u8 rxsalt[4]; __be64 ivinsert_to_authinsrt; } rxhdr; } u; struct keys { __u8 edkey[32]; __u8 ipad[64]; __u8 opad[64]; } keys; }; #define S_TLS_KEYCTX_TX_WR_DUALCK 12 #define M_TLS_KEYCTX_TX_WR_DUALCK 0x1 #define V_TLS_KEYCTX_TX_WR_DUALCK(x) ((x) << S_TLS_KEYCTX_TX_WR_DUALCK) #define G_TLS_KEYCTX_TX_WR_DUALCK(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_DUALCK) & M_TLS_KEYCTX_TX_WR_DUALCK) #define F_TLS_KEYCTX_TX_WR_DUALCK V_TLS_KEYCTX_TX_WR_DUALCK(1U) #define S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 11 #define M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 0x1 #define V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) #define G_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) & \ 
M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) #define F_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT \ V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1U) #define S_TLS_KEYCTX_TX_WR_SALT_PRESENT 10 #define M_TLS_KEYCTX_TX_WR_SALT_PRESENT 0x1 #define V_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_SALT_PRESENT) #define G_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_SALT_PRESENT) & \ M_TLS_KEYCTX_TX_WR_SALT_PRESENT) #define F_TLS_KEYCTX_TX_WR_SALT_PRESENT \ V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1U) #define S_TLS_KEYCTX_TX_WR_TXCK_SIZE 6 #define M_TLS_KEYCTX_TX_WR_TXCK_SIZE 0xf #define V_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXCK_SIZE) #define G_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXCK_SIZE) & \ M_TLS_KEYCTX_TX_WR_TXCK_SIZE) #define S_TLS_KEYCTX_TX_WR_TXMK_SIZE 2 #define M_TLS_KEYCTX_TX_WR_TXMK_SIZE 0xf #define V_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXMK_SIZE) #define G_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXMK_SIZE) & \ M_TLS_KEYCTX_TX_WR_TXMK_SIZE) #define S_TLS_KEYCTX_TX_WR_TXVALID 0 #define M_TLS_KEYCTX_TX_WR_TXVALID 0x1 #define V_TLS_KEYCTX_TX_WR_TXVALID(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXVALID) #define G_TLS_KEYCTX_TX_WR_TXVALID(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXVALID) & M_TLS_KEYCTX_TX_WR_TXVALID) #define F_TLS_KEYCTX_TX_WR_TXVALID V_TLS_KEYCTX_TX_WR_TXVALID(1U) #define S_TLS_KEYCTX_TX_WR_FLITCNT 3 #define M_TLS_KEYCTX_TX_WR_FLITCNT 0x1f #define V_TLS_KEYCTX_TX_WR_FLITCNT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_FLITCNT) #define G_TLS_KEYCTX_TX_WR_FLITCNT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_FLITCNT) & M_TLS_KEYCTX_TX_WR_FLITCNT) #define S_TLS_KEYCTX_TX_WR_HMACCTRL 0 #define M_TLS_KEYCTX_TX_WR_HMACCTRL 0x7 #define V_TLS_KEYCTX_TX_WR_HMACCTRL(x) \ ((x) << S_TLS_KEYCTX_TX_WR_HMACCTRL) #define G_TLS_KEYCTX_TX_WR_HMACCTRL(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_HMACCTRL) & M_TLS_KEYCTX_TX_WR_HMACCTRL) #define S_TLS_KEYCTX_TX_WR_PROTOVER 4 #define M_TLS_KEYCTX_TX_WR_PROTOVER 0xf #define 
V_TLS_KEYCTX_TX_WR_PROTOVER(x) \ ((x) << S_TLS_KEYCTX_TX_WR_PROTOVER) #define G_TLS_KEYCTX_TX_WR_PROTOVER(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_PROTOVER) & M_TLS_KEYCTX_TX_WR_PROTOVER) #define S_TLS_KEYCTX_TX_WR_CIPHMODE 0 #define M_TLS_KEYCTX_TX_WR_CIPHMODE 0xf #define V_TLS_KEYCTX_TX_WR_CIPHMODE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_CIPHMODE) #define G_TLS_KEYCTX_TX_WR_CIPHMODE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_CIPHMODE) & M_TLS_KEYCTX_TX_WR_CIPHMODE) #define S_TLS_KEYCTX_TX_WR_AUTHMODE 4 #define M_TLS_KEYCTX_TX_WR_AUTHMODE 0xf #define V_TLS_KEYCTX_TX_WR_AUTHMODE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_AUTHMODE) #define G_TLS_KEYCTX_TX_WR_AUTHMODE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_AUTHMODE) & M_TLS_KEYCTX_TX_WR_AUTHMODE) #define S_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL 3 #define M_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL 0x1 #define V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(x) \ ((x) << S_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL) #define G_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL) & \ M_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL) #define F_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL \ V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1U) #define S_TLS_KEYCTX_TX_WR_SEQNUMCTRL 1 #define M_TLS_KEYCTX_TX_WR_SEQNUMCTRL 0x3 #define V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(x) \ ((x) << S_TLS_KEYCTX_TX_WR_SEQNUMCTRL) #define G_TLS_KEYCTX_TX_WR_SEQNUMCTRL(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_SEQNUMCTRL) & \ M_TLS_KEYCTX_TX_WR_SEQNUMCTRL) #define S_TLS_KEYCTX_TX_WR_RXVALID 0 #define M_TLS_KEYCTX_TX_WR_RXVALID 0x1 #define V_TLS_KEYCTX_TX_WR_RXVALID(x) \ ((x) << S_TLS_KEYCTX_TX_WR_RXVALID) #define G_TLS_KEYCTX_TX_WR_RXVALID(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_RXVALID) & M_TLS_KEYCTX_TX_WR_RXVALID) #define F_TLS_KEYCTX_TX_WR_RXVALID V_TLS_KEYCTX_TX_WR_RXVALID(1U) #define S_TLS_KEYCTX_TX_WR_IVPRESENT 7 #define M_TLS_KEYCTX_TX_WR_IVPRESENT 0x1 #define V_TLS_KEYCTX_TX_WR_IVPRESENT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_IVPRESENT) #define G_TLS_KEYCTX_TX_WR_IVPRESENT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_IVPRESENT) & \ M_TLS_KEYCTX_TX_WR_IVPRESENT) 
#define F_TLS_KEYCTX_TX_WR_IVPRESENT V_TLS_KEYCTX_TX_WR_IVPRESENT(1U) #define S_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT 6 #define M_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT 0x1 #define V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT) #define G_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT) & \ M_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT) #define F_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT \ V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1U) #define S_TLS_KEYCTX_TX_WR_RXCK_SIZE 3 #define M_TLS_KEYCTX_TX_WR_RXCK_SIZE 0x7 #define V_TLS_KEYCTX_TX_WR_RXCK_SIZE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_RXCK_SIZE) #define G_TLS_KEYCTX_TX_WR_RXCK_SIZE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_RXCK_SIZE) & \ M_TLS_KEYCTX_TX_WR_RXCK_SIZE) #define S_TLS_KEYCTX_TX_WR_RXMK_SIZE 0 #define M_TLS_KEYCTX_TX_WR_RXMK_SIZE 0x7 #define V_TLS_KEYCTX_TX_WR_RXMK_SIZE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_RXMK_SIZE) #define G_TLS_KEYCTX_TX_WR_RXMK_SIZE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_RXMK_SIZE) & \ M_TLS_KEYCTX_TX_WR_RXMK_SIZE) #define S_TLS_KEYCTX_TX_WR_IVINSERT 55 #define M_TLS_KEYCTX_TX_WR_IVINSERT 0x1ffULL #define V_TLS_KEYCTX_TX_WR_IVINSERT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_IVINSERT) #define G_TLS_KEYCTX_TX_WR_IVINSERT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_IVINSERT) & M_TLS_KEYCTX_TX_WR_IVINSERT) #define S_TLS_KEYCTX_TX_WR_AADSTRTOFST 47 #define M_TLS_KEYCTX_TX_WR_AADSTRTOFST 0xffULL #define V_TLS_KEYCTX_TX_WR_AADSTRTOFST(x) \ ((x) << S_TLS_KEYCTX_TX_WR_AADSTRTOFST) #define G_TLS_KEYCTX_TX_WR_AADSTRTOFST(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_AADSTRTOFST) & \ M_TLS_KEYCTX_TX_WR_AADSTRTOFST) #define S_TLS_KEYCTX_TX_WR_AADSTOPOFST 39 #define M_TLS_KEYCTX_TX_WR_AADSTOPOFST 0xffULL #define V_TLS_KEYCTX_TX_WR_AADSTOPOFST(x) \ ((x) << S_TLS_KEYCTX_TX_WR_AADSTOPOFST) #define G_TLS_KEYCTX_TX_WR_AADSTOPOFST(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_AADSTOPOFST) & \ M_TLS_KEYCTX_TX_WR_AADSTOPOFST) #define S_TLS_KEYCTX_TX_WR_CIPHERSRTOFST 30 #define M_TLS_KEYCTX_TX_WR_CIPHERSRTOFST 0x1ffULL #define 
V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(x) \ ((x) << S_TLS_KEYCTX_TX_WR_CIPHERSRTOFST) #define G_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_CIPHERSRTOFST) & \ M_TLS_KEYCTX_TX_WR_CIPHERSRTOFST) #define S_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST 23 #define M_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST 0x7f #define V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(x) \ ((x) << S_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST) #define G_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST) & \ M_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST) #define S_TLS_KEYCTX_TX_WR_AUTHSRTOFST 14 #define M_TLS_KEYCTX_TX_WR_AUTHSRTOFST 0x1ff #define V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(x) \ ((x) << S_TLS_KEYCTX_TX_WR_AUTHSRTOFST) #define G_TLS_KEYCTX_TX_WR_AUTHSRTOFST(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_AUTHSRTOFST) & \ M_TLS_KEYCTX_TX_WR_AUTHSRTOFST) #define S_TLS_KEYCTX_TX_WR_AUTHSTOPOFST 7 #define M_TLS_KEYCTX_TX_WR_AUTHSTOPOFST 0x7f #define V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(x) \ ((x) << S_TLS_KEYCTX_TX_WR_AUTHSTOPOFST) #define G_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_AUTHSTOPOFST) & \ M_TLS_KEYCTX_TX_WR_AUTHSTOPOFST) #define S_TLS_KEYCTX_TX_WR_AUTHINSRT 0 #define M_TLS_KEYCTX_TX_WR_AUTHINSRT 0x7f #define V_TLS_KEYCTX_TX_WR_AUTHINSRT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_AUTHINSRT) #define G_TLS_KEYCTX_TX_WR_AUTHINSRT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_AUTHINSRT) & \ M_TLS_KEYCTX_TX_WR_AUTHINSRT) struct tls_hdr { __u8 type; __be16 version; __be16 length; } __packed; struct tlsrx_hdr_pkt { __u8 type; __be16 version; __be16 length; __be64 tls_seq; __be16 reserved1; __u8 res_to_mac_error; } __packed; /* res_to_mac_error fields */ #define S_TLSRX_HDR_PKT_INTERNAL_ERROR 4 #define M_TLSRX_HDR_PKT_INTERNAL_ERROR 0x1 #define V_TLSRX_HDR_PKT_INTERNAL_ERROR(x) \ ((x) << S_TLSRX_HDR_PKT_INTERNAL_ERROR) #define G_TLSRX_HDR_PKT_INTERNAL_ERROR(x) \ (((x) >> S_TLSRX_HDR_PKT_INTERNAL_ERROR) & M_TLSRX_HDR_PKT_INTERNAL_ERROR) #define F_TLSRX_HDR_PKT_INTERNAL_ERROR V_TLSRX_HDR_PKT_INTERNAL_ERROR(1U) #define 
S_TLSRX_HDR_PKT_SPP_ERROR 3 #define M_TLSRX_HDR_PKT_SPP_ERROR 0x1 #define V_TLSRX_HDR_PKT_SPP_ERROR(x) ((x) << S_TLSRX_HDR_PKT_SPP_ERROR) #define G_TLSRX_HDR_PKT_SPP_ERROR(x) \ (((x) >> S_TLSRX_HDR_PKT_SPP_ERROR) & M_TLSRX_HDR_PKT_SPP_ERROR) #define F_TLSRX_HDR_PKT_SPP_ERROR V_TLSRX_HDR_PKT_SPP_ERROR(1U) #define S_TLSRX_HDR_PKT_CCDX_ERROR 2 #define M_TLSRX_HDR_PKT_CCDX_ERROR 0x1 #define V_TLSRX_HDR_PKT_CCDX_ERROR(x) ((x) << S_TLSRX_HDR_PKT_CCDX_ERROR) #define G_TLSRX_HDR_PKT_CCDX_ERROR(x) \ (((x) >> S_TLSRX_HDR_PKT_CCDX_ERROR) & M_TLSRX_HDR_PKT_CCDX_ERROR) #define F_TLSRX_HDR_PKT_CCDX_ERROR V_TLSRX_HDR_PKT_CCDX_ERROR(1U) #define S_TLSRX_HDR_PKT_PAD_ERROR 1 #define M_TLSRX_HDR_PKT_PAD_ERROR 0x1 #define V_TLSRX_HDR_PKT_PAD_ERROR(x) ((x) << S_TLSRX_HDR_PKT_PAD_ERROR) #define G_TLSRX_HDR_PKT_PAD_ERROR(x) \ (((x) >> S_TLSRX_HDR_PKT_PAD_ERROR) & M_TLSRX_HDR_PKT_PAD_ERROR) #define F_TLSRX_HDR_PKT_PAD_ERROR V_TLSRX_HDR_PKT_PAD_ERROR(1U) #define S_TLSRX_HDR_PKT_MAC_ERROR 0 #define M_TLSRX_HDR_PKT_MAC_ERROR 0x1 #define V_TLSRX_HDR_PKT_MAC_ERROR(x) ((x) << S_TLSRX_HDR_PKT_MAC_ERROR) #define G_TLSRX_HDR_PKT_MAC_ERROR(x) \ (((x) >> S_TLSRX_HDR_PKT_MAC_ERROR) & M_TLSRX_HDR_PKT_MAC_ERROR) #define F_TLSRX_HDR_PKT_MAC_ERROR V_TLSRX_HDR_PKT_MAC_ERROR(1U) #define M_TLSRX_HDR_PKT_ERROR 0x1F #endif /* _KERNEL */ #endif /* !__T4_TLS_H__ */ Index: head/sys/dev/cxgbe/tom/t4_tom.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_tom.c (revision 366853) +++ head/sys/dev/cxgbe/tom/t4_tom.c (revision 366854) @@ -1,1938 +1,1936 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw toe_protosw; static struct pr_usrreqs toe_usrreqs; static struct protosw toe6_protosw; static struct pr_usrreqs toe6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. 
*/ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. */ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->chip_params->nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->flags & CLRL_ERR) { log(LOG_ERR, "%s: failed to associate traffic class %u with tid %u\n", device_get_nameunit(vi->dev), cp->tc_idx, toep->tid); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; tls_init_toep(toep); if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_init_toep(toep); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) 
return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. 
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_len(&toep->ulp_pduq) == 0); MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 
2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) t4_release_lip(sc, toep->ce); if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; toep->params.nagle = tp->t_flags & TF_NODELAY ? 
0 : 1; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0); break; default: break; } } static inline uint64_t get_tcb_tflags(const uint64_t *tcb) { return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } static inline uint32_t get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { #define LAST_WORD ((TCB_SIZE / 4) - 1) uint64_t t1, t2; int flit_idx; MPASS(mask != 0); MPASS(word <= LAST_WORD); MPASS(shift < 32); flit_idx = (LAST_WORD - word) / 2; if (word & 0x1) shift += 32; t1 = be64toh(tcb[flit_idx]) >> shift; t2 = 0; if (fls(mask) > 64 - shift) { /* * Will spill over into the next logical flit, which is the flit * before this one. The flit_idx before this one must be valid. */ MPASS(flit_idx > 0); t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); } return ((t2 | t1) & mask); #undef LAST_WORD } #define GET_TCB_FIELD(tcb, F) \ get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) /* * Issues a CPL_GET_TCB to read the entire TCB for the tid. 
*/ static int send_get_tcb(struct adapter *sc, u_int tid) { struct cpl_get_tcb *cpl; struct wrq_cookie cookie; MPASS(tid < sc->tids.ntids); cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), &cookie); if (__predict_false(cpl == NULL)) return (ENOMEM); bzero(cpl, sizeof(*cpl)); INIT_TP_WR(cpl, tid); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); cpl->cookie = 0xff; commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); return (0); } static struct tcb_histent * alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) { struct tcb_histent *te; MPASS(flags == M_NOWAIT || flags == M_WAITOK); te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); if (te == NULL) return (NULL); mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); callout_init_mtx(&te->te_callout, &te->te_lock, 0); te->te_adapter = sc; te->te_tid = tid; return (te); } static void free_tcb_histent(struct tcb_histent *te) { mtx_destroy(&te->te_lock); free(te, M_CXGBE); } /* * Start tracking the tid in the TCB history. 
*/ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. 
*/ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. 
*/ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. 
*/ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. */ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. */ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. 
*/ static void t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; toep->flags &= ~TPF_CPL_PENDING; mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. 
Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 
calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); #ifdef USE_DDP_RX_FLOW_CONTROL if (cp->ulp_mode == ULP_MODE_TCPDDP) opt2 |= F_RX_FC_DDP; #endif - if (cp->ulp_mode == ULP_MODE_TLS) - opt2 |= F_RX_FC_DISABLE; return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. 
*/ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->ingress_config & F_VNIC) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } static int is_tls_sock(struct socket *so, struct adapter *sc) { struct inpcb *inp = sotoinpcb(so); int i, rc; /* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */ rc = 0; ADAPTER_LOCK(sc); for (i = 0; i < sc->tt.num_tls_rx_ports; i++) { if (inp->inp_lport == htons(sc->tt.tls_rx_ports[i]) || inp->inp_fport == htons(sc->tt.tls_rx_ports[i])) { rc = 1; break; } } ADAPTER_UNLOCK(sc); return (rc); } /* * Initialize various connection parameters. 
*/
/*
 * Each parameter in 'cp' is taken from the offload settings 's' when the
 * setting is non-negative (explicitly specified); otherwise it is derived from
 * the driver tunables and the state of the socket/tcpcb.  's' must allow
 * offload.
 */
void
init_conn_params(struct vi_info *vi, struct offload_settings *s,
    struct in_conninfo *inc, struct socket *so,
    const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp)
{
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct tom_tunables *tt = &sc->tt;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	u_long wnd;

	MPASS(s->offload != 0);

	/* Congestion control algorithm */
	if (s->cong_algo >= 0)
		cp->cong_algo = s->cong_algo & M_CONG_CNTRL;
	else if (sc->tt.cong_algorithm >= 0)
		cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL;
	else {
		struct cc_algo *cc = CC_ALGO(tp);

		/*
		 * This must be a single if/else-if chain.  Without it the
		 * final else would override every match except "highspeed".
		 */
		if (strcasecmp(cc->name, "reno") == 0)
			cp->cong_algo = CONG_ALG_RENO;
		else if (strcasecmp(cc->name, "tahoe") == 0)
			cp->cong_algo = CONG_ALG_TAHOE;
		else if (strcasecmp(cc->name, "newreno") == 0)
			cp->cong_algo = CONG_ALG_NEWRENO;
		else if (strcasecmp(cc->name, "highspeed") == 0)
			cp->cong_algo = CONG_ALG_HIGHSPEED;
		else {
			/*
			 * Use newreno in case the algorithm selected by the
			 * host stack is not supported by the hardware.
			 */
			cp->cong_algo = CONG_ALG_NEWRENO;
		}
	}

	/* Tx traffic scheduling class. */
	if (s->sched_class >= 0 &&
	    s->sched_class < sc->chip_params->nsched_cls) {
		cp->tc_idx = s->sched_class;
	} else
		cp->tc_idx = -1;

	/* Nagle's algorithm. */
	if (s->nagle >= 0)
		cp->nagle = s->nagle > 0 ? 1 : 0;
	else
		cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1;

	/* TCP Keepalive. */
	if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE)
		cp->keepalive = 1;
	else
		cp->keepalive = 0;

	/* Optimization that's specific to T5 @ 40G. */
	if (tt->tx_align >= 0)
		cp->tx_align = tt->tx_align > 0 ? 1 : 0;
	else if (chip_id(sc) == CHELSIO_T5 &&
	    (port_top_speed(pi) > 10 || sc->params.nports > 2))
		cp->tx_align = 1;
	else
		cp->tx_align = 0;

	/* ULP mode.
*/ if (can_tls_offload(sc) && (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc)))) cp->ulp_mode = ULP_MODE_TLS; else if (s->ddp > 0 || (s->ddp < 0 && sc->tt.ddp && (so_options_get(so) & SO_NO_DDP) == 0)) cp->ulp_mode = ULP_MODE_TCPDDP; else cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (cp->ulp_mode == ULP_MODE_TLS) cp->rx_coalesce = 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. */ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Tx queue for this connection. */ if (s->txq >= 0 && s->txq < vi->nofldtxq) cp->txq_idx = s->txq; else cp->txq_idx = arc4random() % vi->nofldtxq; cp->txq_idx += vi->first_ofld_txq; /* Rx queue for this connection. */ if (s->rxq >= 0 && s->rxq < vi->nofldrxq) cp->rxq_idx = s->rxq; else cp->rxq_idx = arc4random() % vi->nofldrxq; cp->rxq_idx += vi->first_ofld_rxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. 
*/ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. 
*/ cp->emss = 0; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tab(struct tid_info *t, int flags) { MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, M_ZERO | flags); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tab(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } static int alloc_stid_tab(struct tid_info *t, int flags) { MPASS(t->nstids > 0); MPASS(t->stid_tab == NULL); t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, M_ZERO | flags); if (t->stid_tab == NULL) return (ENOMEM); mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; return (0); } static void free_stid_tab(struct tid_info *t) { KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); free(t->stid_tab, M_CXGBE); t->stid_tab = NULL; } static void free_tid_tabs(struct tid_info *t) { free_tid_tab(t); free_stid_tab(t); } static int alloc_tid_tabs(struct tid_info *t) { int rc; rc = alloc_tid_tab(t, M_NOWAIT); if (rc != 0) goto failed; rc = alloc_stid_tab(t, M_NOWAIT); if (rc != 0) goto failed; return (0); failed: free_tid_tabs(t); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct 
tom_data *td) { #ifdef INVARIANTS int i; if (td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tcb_history(sc, td); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static char * prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, int *buflen) { char *pkt; struct tcphdr *th; int ipv6, len; const int maxlen = max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + sizeof(struct tcphdr); MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); if (pkt == NULL) return (NULL); ipv6 = inp->inp_vflag & INP_IPV6; len = 0; if (EVL_VLANOFTAG(vtag) == 0xfff) { struct ether_header *eh = (void *)pkt; if (ipv6) eh->ether_type = htons(ETHERTYPE_IPV6); else eh->ether_type = htons(ETHERTYPE_IP); len += sizeof(*eh); } else { struct ether_vlan_header *evh = (void *)pkt; evh->evl_encap_proto = htons(ETHERTYPE_VLAN); evh->evl_tag = htons(vtag); if (ipv6) evh->evl_proto = htons(ETHERTYPE_IPV6); else evh->evl_proto = htons(ETHERTYPE_IP); len += sizeof(*evh); } if (ipv6) { struct ip6_hdr *ip6 = (void *)&pkt[len]; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct 
tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = ip6->ip6_src; } len += sizeof(*ip6); } else { struct ip *ip = (void *)&pkt[len]; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); ip->ip_ttl = inp->inp_ip_ttl; ip->ip_p = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip->ip_src = inp->inp_laddr; ip->ip_dst = ip->ip_src; } len += sizeof(*ip); } th = (void *)&pkt[len]; if (open_type == OPEN_TYPE_ACTIVE) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = inp->inp_fport; /* ditto */ } else if (open_type == OPEN_TYPE_LISTEN) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = th->th_sport; } len += sizeof(th); *pktlen = *buflen = len; return (pkt); } const struct offload_settings * lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, uint16_t vtag, struct inpcb *inp) { const struct t4_offload_policy *op; char *pkt; struct offload_rule *r; int i, matched, pktlen, buflen; static const struct offload_settings allow_offloading_settings = { .offload = 1, .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1, .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1, .ddp = -1, .tls = -1, .txq = -1, .rxq = -1, .mss = -1, }; static const struct offload_settings disallow_offloading_settings = { .offload = 0, /* rest is irrelevant when offload is off. */ }; rw_assert(&sc->policy_lock, RA_LOCKED); /* * If there's no Connection Offloading Policy attached to the device * then we need to return a default static policy. If * "cop_managed_offloading" is true, then we need to disallow * offloading until a COP is attached to the device. 
Otherwise we * allow offloading ... */ op = sc->policy; if (op == NULL) { if (sc->tt.cop_managed_offloading) return (&disallow_offloading_settings); else return (&allow_offloading_settings); } switch (open_type) { case OPEN_TYPE_ACTIVE: case OPEN_TYPE_LISTEN: pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen); break; case OPEN_TYPE_PASSIVE: MPASS(m != NULL); pkt = mtod(m, char *); MPASS(*pkt == CPL_PASS_ACCEPT_REQ); pkt += sizeof(struct cpl_pass_accept_req); pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); buflen = m->m_len - sizeof(struct cpl_pass_accept_req); break; default: MPASS(0); return (&disallow_offloading_settings); } if (pkt == NULL || pktlen == 0 || buflen == 0) return (&disallow_offloading_settings); matched = 0; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { if (r->open_type != open_type && r->open_type != OPEN_TYPE_DONTCARE) { continue; } matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); if (matched) break; } if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) free(pkt, M_CXGBE); return (matched ? 
&r->settings : &disallow_offloading_settings); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid, tid; struct wrqe *wr; struct adapter *sc = td_adapter(td); mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; case CPL_PASS_ACCEPT_RPL: tid = GET_TID(cpl); CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid); synack_failure_cleanup(sc, tid); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. 
*/ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; int i, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); alloc_tcb_history(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; tod->tod_tcp_info = t4_tcp_info; #ifdef KERN_TLS tod->tod_alloc_tls_session = t4_alloc_tls_session; #endif for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { TOEDEV(vi->ifp) = &td->tod; } 
} sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; if (ulp_mode(toep) == ULP_MODE_TCPDDP) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) { if (sopt->sopt_level != IPPROTO_TCP) return (tcp_ctloutput(so, sopt)); switch (sopt->sopt_name) { case TCP_TLSOM_SET_TLS_CONTEXT: case TCP_TLSOM_GET_TLS_TOM: case TCP_TLSOM_CLR_TLS_TOM: case TCP_TLSOM_CLR_QUIES: return (t4_ctloutput_tls(so, sopt)); default: return (tcp_ctloutput(so, sopt)); } } static int t4_tom_mod_load(void) { struct protosw *tcp_protosw, *tcp6_protosw; /* CPL handlers */ t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); 
t4_init_cpl_io_handlers(); t4_ddp_mod_load(); t4_tls_mod_load(); tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw)); bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs)); toe_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_usrreqs = &toe_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs)); toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_usrreqs = &toe6_usrreqs; return (t4_register_uld(&tom_uld_info)); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); t4_tls_mod_unload(); t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); 
/*
 * Module linkage for t4_tom: it names the toecore and t4nex modules as
 * dependencies and hands t4_tom_moddata to the kernel with the SI_SUB_EXEC
 * subsystem and SI_ORDER_ANY order.
 * NOTE(review): the three numeric MODULE_DEPEND arguments are presumably the
 * minimum, preferred, and maximum acceptable versions of the dependency --
 * confirm against the module documentation.
 */
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);