diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c index a57d26ae21b8..5526388915f7 100644 --- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c @@ -1,1427 +1,1441 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2015 Chelsio Communications, Inc. * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * cxgbei implementation of iSCSI Common Layer kobj(9) interface. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" #include "cxgbei.h" /* * Use the page pod tag for the TT hash. */ #define TT_HASH(icc, tt) (G_PPOD_TAG(tt) & (icc)->cmp_hash_mask) struct cxgbei_ddp_state { struct ppod_reservation prsv; struct cxgbei_cmp cmp; }; static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)"); SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Chelsio iSCSI offload"); static int first_burst_length = 8192; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN, &first_burst_length, 0, "First burst length"); static int max_burst_length = 2 * 1024 * 1024; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN, &max_burst_length, 0, "Maximum burst length"); static int sendspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN, &sendspace, 0, "Default send socket buffer size"); static int recvspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN, &recvspace, 0, "Default receive socket buffer size"); static volatile u_int icl_cxgbei_ncons; #define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) #define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) #define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) #define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu; static icl_conn_pdu_data_segment_length_t icl_cxgbei_conn_pdu_data_segment_length; static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data; static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data; static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue; static icl_conn_pdu_queue_cb_t icl_cxgbei_conn_pdu_queue_cb; static icl_conn_handoff_t icl_cxgbei_conn_handoff; static icl_conn_free_t icl_cxgbei_conn_free; static icl_conn_close_t icl_cxgbei_conn_close; static icl_conn_task_setup_t icl_cxgbei_conn_task_setup; static icl_conn_task_done_t icl_cxgbei_conn_task_done; static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup; static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done; static kobj_method_t icl_cxgbei_methods[] = { KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu), KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free), KOBJMETHOD(icl_conn_pdu_data_segment_length, icl_cxgbei_conn_pdu_data_segment_length), KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data), KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data), KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue), KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb), KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff), KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free), KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close), KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup), KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done), KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup), KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done), { 0, 0 } }; DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn)); void icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU")); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); m_freem(ip->ip_ahs_mbuf); m_freem(ip->ip_data_mbuf); m_freem(ip->ip_bhs_mbuf); KASSERT(ic != NULL || icp->ref_cnt == 1, ("orphaned PDU has oustanding references")); if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1) return; free(icp, M_CXGBEI); #ifdef DIAGNOSTIC if (__predict_true(ic != NULL)) refcount_release(&ic->ic_outstanding_pdus); #endif } static void icl_cxgbei_pdu_call_cb(struct icl_pdu *ip) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); if (icp->cb != NULL) icp->cb(ip, icp->error); #ifdef DIAGNOSTIC if (__predict_true(ip->ip_conn != NULL)) refcount_release(&ip->ip_conn->ic_outstanding_pdus); #endif free(icp, M_CXGBEI); } static void icl_cxgbei_pdu_done(struct icl_pdu *ip, int error) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (error != 0) icp->error = error; m_freem(ip->ip_ahs_mbuf); ip->ip_ahs_mbuf = NULL; m_freem(ip->ip_data_mbuf); ip->ip_data_mbuf = NULL; m_freem(ip->ip_bhs_mbuf); ip->ip_bhs_mbuf = NULL; /* * All other references to this PDU should have been dropped * by the m_freem() of ip_data_mbuf. */ if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) icl_cxgbei_pdu_call_cb(ip); else __assert_unreachable(); } static void icl_cxgbei_mbuf_done(struct mbuf *mb) { struct icl_cxgbei_pdu *icp = (struct icl_cxgbei_pdu *)mb->m_ext.ext_arg1; /* * NB: mb_free_mext() might leave ref_cnt as 1 without * decrementing it if it hits the fast path in the ref_cnt * check. */ icl_cxgbei_pdu_call_cb(&icp->ip); } struct icl_pdu * icl_cxgbei_new_pdu(int flags) { struct icl_cxgbei_pdu *icp; struct icl_pdu *ip; struct mbuf *m; icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO); if (__predict_false(icp == NULL)) return (NULL); icp->icp_signature = CXGBEI_PDU_SIGNATURE; icp->ref_cnt = 1; ip = &icp->ip; m = m_gethdr(flags, MT_DATA); if (__predict_false(m == NULL)) { free(icp, M_CXGBEI); return (NULL); } ip->ip_bhs_mbuf = m; ip->ip_bhs = mtod(m, struct iscsi_bhs *); memset(ip->ip_bhs, 0, sizeof(*ip->ip_bhs)); m->m_len = sizeof(struct iscsi_bhs); m->m_pkthdr.len = m->m_len; return (ip); } void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic) { ip->ip_conn = ic; #ifdef DIAGNOSTIC refcount_acquire(&ic->ic_outstanding_pdus); #endif } /* * Allocate icl_pdu with empty BHS to fill up by the caller. */ static struct icl_pdu * icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags) { struct icl_pdu *ip; ip = icl_cxgbei_new_pdu(flags); if (__predict_false(ip == NULL)) return (NULL); icl_cxgbei_new_pdu_set_conn(ip, ic); return (ip); } static size_t icl_pdu_data_segment_length(const struct icl_pdu *request) { uint32_t len = 0; len += request->ip_bhs->bhs_data_segment_len[0]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[1]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[2]; return (len); } size_t icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic, const struct icl_pdu *request) { return (icl_pdu_data_segment_length(request)); } static struct mbuf * finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp) { struct icl_pdu *ip = &icp->ip; uint8_t ulp_submode, padding; struct mbuf *m, *last; struct iscsi_bhs *bhs; int data_len; /* * Fix up the data segment mbuf first. */ m = ip->ip_data_mbuf; ulp_submode = icc->ulp_submode; if (m != NULL) { last = m_last(m); /* * Round up the data segment to a 4B boundary. Pad with 0 if * necessary. There will definitely be room in the mbuf. */ padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len; if (padding != 0) { MPASS(padding <= M_TRAILINGSPACE(last)); bzero(mtod(last, uint8_t *) + last->m_len, padding); last->m_len += padding; } } else { MPASS(ip->ip_data_len == 0); ulp_submode &= ~ULP_CRC_DATA; padding = 0; } /* * Now the header mbuf that has the BHS. */ m = ip->ip_bhs_mbuf; MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs)); MPASS(m->m_len == sizeof(struct iscsi_bhs)); bhs = ip->ip_bhs; data_len = ip->ip_data_len; if (data_len > icc->ic.ic_max_send_data_segment_length) { struct iscsi_bhs_data_in *bhsdi; int flags; KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p", __func__, padding, icp)); switch (bhs->bhs_opcode) { case ISCSI_BHS_OPCODE_SCSI_DATA_OUT: flags = 1; break; case ISCSI_BHS_OPCODE_SCSI_DATA_IN: flags = 2; break; default: panic("invalid opcode %#x for ISO", bhs->bhs_opcode); } data_len = icc->ic.ic_max_send_data_segment_length; bhsdi = (struct iscsi_bhs_data_in *)bhs; if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) { /* * Firmware will set F on the final PDU in the * burst. */ flags |= CXGBE_ISO_F; bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F; } set_mbuf_iscsi_iso(m, true); set_mbuf_iscsi_iso_flags(m, flags); set_mbuf_iscsi_iso_mss(m, data_len); } bhs->bhs_data_segment_len[2] = data_len; bhs->bhs_data_segment_len[1] = data_len >> 8; bhs->bhs_data_segment_len[0] = data_len >> 16; /* * Extract mbuf chain from PDU. */ m->m_pkthdr.len += ip->ip_data_len + padding; m->m_next = ip->ip_data_mbuf; set_mbuf_ulp_submode(m, ulp_submode); ip->ip_bhs_mbuf = NULL; ip->ip_data_mbuf = NULL; ip->ip_bhs = NULL; /* * Drop PDU reference on icp. Additional references might * still be held by zero-copy PDU buffers (ICL_NOCOPY). */ if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) icl_cxgbei_pdu_call_cb(ip); return (m); } int icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip, const void *addr, size_t len, int flags) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct mbuf *m, *m_tail; const char *src; MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len)); m_tail = ip->ip_data_mbuf; if (m_tail != NULL) for (; m_tail->m_next != NULL; m_tail = m_tail->m_next) ; if (flags & ICL_NOCOPY) { m = m_get(flags & ~ICL_NOCOPY, MT_DATA); if (m == NULL) { ICL_WARN("failed to allocate mbuf"); return (ENOMEM); } m->m_flags |= M_RDONLY; m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt, icl_cxgbei_mbuf_done, icp, NULL); m->m_len = len; if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m; ip->ip_data_len = len; } else { m_tail->m_next = m; m_tail = m_tail->m_next; ip->ip_data_len += len; } return (0); } src = (const char *)addr; /* Allocate as jumbo mbufs of size MJUM16BYTES. */ while (len >= MJUM16BYTES) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES); if (__predict_false(m == NULL)) { if ((flags & M_WAITOK) != 0) { /* Fall back to non-jumbo mbufs. */ break; } return (ENOMEM); } memcpy(mtod(m, void *), src, MJUM16BYTES); m->m_len = MJUM16BYTES; if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m_tail = m; ip->ip_data_len = MJUM16BYTES; } else { m_tail->m_next = m; m_tail = m_tail->m_next; ip->ip_data_len += MJUM16BYTES; } src += MJUM16BYTES; len -= MJUM16BYTES; } /* Allocate mbuf chain for the remaining data. */ if (len != 0) { m = m_getm2(NULL, len, flags, MT_DATA, 0); if (__predict_false(m == NULL)) return (ENOMEM); if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m; ip->ip_data_len = len; } else { m_tail->m_next = m; ip->ip_data_len += len; } for (; m != NULL; m = m->m_next) { m->m_len = min(len, M_SIZE(m)); memcpy(mtod(m, void *), src, m->m_len); src += m->m_len; len -= m->m_len; } MPASS(len == 0); } MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length, ic->ic_hw_isomax)); return (0); } void icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, size_t off, void *addr, size_t len) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (icp->icp_flags & ICPF_RX_DDP) return; /* data is DDP'ed, no need to copy */ m_copydata(ip->ip_data_mbuf, off, len, addr); } void icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) { icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL); } void icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip, icl_pdu_cb cb) { struct epoch_tracker et; struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct socket *so = ic->ic_socket; struct toepcb *toep = icc->toep; struct inpcb *inp; struct mbuf *m; MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); /* The kernel doesn't generate PDUs with AHS. */ MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0); ICL_CONN_LOCK_ASSERT(ic); icp->cb = cb; /* NOTE: sowriteable without so_snd lock is a mostly harmless race. */ if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) { icl_cxgbei_pdu_done(ip, ENOTCONN); return; } m = finalize_pdu(icc, icp); M_ASSERTPKTHDR(m); MPASS((m->m_pkthdr.len & 3) == 0); /* * Do not get inp from toep->inp as the toepcb might have detached * already. */ inp = sotoinpcb(so); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || __predict_false((toep->flags & TPF_ATTACHED) == 0)) m_freem(m); else { mbufq_enqueue(&toep->ulp_pduq, m); t4_push_pdus(icc->sc, toep, 0); } INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } static struct icl_conn * icl_cxgbei_new_conn(const char *name, struct mtx *lock) { struct icl_cxgbei_conn *icc; struct icl_conn *ic; refcount_acquire(&icl_cxgbei_ncons); icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE, M_WAITOK | M_ZERO); icc->icc_signature = CXGBEI_CONN_SIGNATURE; STAILQ_INIT(&icc->rcvd_pdus); icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask); mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF); ic = &icc->ic; ic->ic_lock = lock; #ifdef DIAGNOSTIC refcount_init(&ic->ic_outstanding_pdus, 0); #endif ic->ic_name = name; ic->ic_offload = "cxgbei"; ic->ic_unmapped = false; CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); return (ic); } void icl_cxgbei_conn_free(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); mtx_destroy(&icc->cmp_lock); hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask); kobj_delete((struct kobj *)icc, M_CXGBE); refcount_release(&icl_cxgbei_ncons); } static int icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace, int rspace) { struct sockopt opt; int error, one = 1, ss, rs; ss = max(sendspace, sspace); rs = max(recvspace, rspace); error = soreserve(so, ss, rs); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_rcv); /* * Disable Nagle. */ bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } return (0); } /* * Request/response structure used to find out the adapter offloading a socket. */ struct find_ofld_adapter_rr { struct socket *so; struct adapter *sc; /* result */ }; static void find_offload_adapter(struct adapter *sc, void *arg) { struct find_ofld_adapter_rr *fa = arg; struct socket *so = fa->so; struct tom_data *td = sc->tom_softc; struct tcpcb *tp; struct inpcb *inp; /* Non-TCP were filtered out earlier. */ MPASS(so->so_proto->pr_protocol == IPPROTO_TCP); if (fa->sc != NULL) return; /* Found already. */ if (td == NULL) return; /* TOE not enabled on this adapter. */ inp = sotoinpcb(so); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { tp = intotcpcb(inp); if (tp->t_flags & TF_TOE && tp->tod == &td->tod) fa->sc = sc; /* Found. */ } INP_WUNLOCK(inp); } /* XXXNP: move this to t4_tom. */ static void send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen) { struct wrqe *wr; struct fw_flowc_wr *flowc; const u_int nparams = 1; u_int flowclen; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX; flowc->mnemval[0].val = htobe32(maxlen); txsd->tx_credits = howmany(flowclen, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); } static void set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode) { uint64_t val; CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x", __func__, toep->tid, ulp_submode); val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE, V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val, 0, 0); val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, val, val, 0, 0); } /* * XXXNP: Who is responsible for cleaning up the socket if this returns with an * error? Review all error paths. * * XXXNP: What happens to the socket's fd reference if the operation is * successful, and how does that affect the socket's life cycle? */ int icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct cxgbei_data *ci; struct find_ofld_adapter_rr fa; struct file *fp; struct socket *so; struct inpcb *inp; struct tcpcb *tp; struct toepcb *toep; cap_rights_t rights; int error, max_iso_pdus; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); /* * Steal the socket from userland. */ error = fget(curthread, fd, cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (EINVAL); } so = fp->f_data; if (so->so_type != SOCK_STREAM || so->so_proto->pr_protocol != IPPROTO_TCP) { fdrop(fp, curthread); return (EINVAL); } ICL_CONN_LOCK(ic); if (ic->ic_socket != NULL) { ICL_CONN_UNLOCK(ic); fdrop(fp, curthread); return (EBUSY); } ic->ic_disconnecting = false; ic->ic_socket = so; fp->f_ops = &badfileops; fp->f_data = NULL; fdrop(fp, curthread); ICL_CONN_UNLOCK(ic); /* Find the adapter offloading this socket. */ fa.sc = NULL; fa.so = so; t4_iterate(find_offload_adapter, &fa); if (fa.sc == NULL) return (EINVAL); icc->sc = fa.sc; ci = icc->sc->iscsi_ulp_softc; inp = sotoinpcb(so); INP_WLOCK(inp); tp = intotcpcb(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) error = EBUSY; else { /* * socket could not have been "unoffloaded" if here. */ MPASS(tp->t_flags & TF_TOE); MPASS(tp->tod != NULL); MPASS(tp->t_toe != NULL); toep = tp->t_toe; MPASS(toep->vi->adapter == icc->sc); icc->toep = toep; icc->cwt = cxgbei_select_worker_thread(icc); icc->ulp_submode = 0; if (ic->ic_header_crc32c) icc->ulp_submode |= ULP_CRC_HEADER; if (ic->ic_data_crc32c) icc->ulp_submode |= ULP_CRC_DATA; if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5) { max_iso_pdus = CXGBEI_MAX_ISO_PAYLOAD / ci->max_tx_pdu_len; ic->ic_hw_isomax = max_iso_pdus * ic->ic_max_send_data_segment_length; } else max_iso_pdus = 1; so->so_options |= SO_NO_DDP; toep->params.ulp_mode = ULP_MODE_ISCSI; toep->ulpcb = icc; send_iscsi_flowc_wr(icc->sc, toep, roundup(max_iso_pdus * ci->max_tx_pdu_len, tp->t_maxseg)); set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode); error = 0; } INP_WUNLOCK(inp); if (error == 0) { error = icl_cxgbei_setsockopt(ic, so, ci->max_tx_pdu_len, ci->max_rx_pdu_len); } return (error); } void icl_cxgbei_conn_close(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct icl_pdu *ip; struct socket *so; struct sockbuf *sb; struct inpcb *inp; struct toepcb *toep = icc->toep; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); ICL_CONN_LOCK(ic); so = ic->ic_socket; if (ic->ic_disconnecting || so == NULL) { CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p", __func__, icc, ic->ic_disconnecting, so); ICL_CONN_UNLOCK(ic); return; } ic->ic_disconnecting = true; #ifdef DIAGNOSTIC KASSERT(ic->ic_outstanding_pdus == 0, ("destroying session with %d outstanding PDUs", ic->ic_outstanding_pdus)); #endif ICL_CONN_UNLOCK(ic); CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1, icc); inp = sotoinpcb(so); sb = &so->so_rcv; INP_WLOCK(inp); if (toep != NULL) { /* NULL if connection was never offloaded. */ toep->ulpcb = NULL; /* Discard PDUs queued for TX. */ mbufq_drain(&toep->ulp_pduq); /* * Wait for the cwt threads to stop processing this * connection. */ SOCKBUF_LOCK(sb); if (icc->rx_flags & RXF_ACTIVE) { volatile u_int *p = &icc->rx_flags; SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); while (*p & RXF_ACTIVE) pause("conclo", 1); INP_WLOCK(inp); SOCKBUF_LOCK(sb); } /* * Discard received PDUs not passed to the iSCSI * layer. */ while (!STAILQ_EMPTY(&icc->rcvd_pdus)) { ip = STAILQ_FIRST(&icc->rcvd_pdus); STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next); icl_cxgbei_pdu_done(ip, ENOTCONN); } SOCKBUF_UNLOCK(sb); + + /* + * Grab a reference to use when waiting for the final + * CPL to be received. If toep->inp is NULL, then + * final_cpl_received() has already been called (e.g. + * due to the peer sending a RST). + */ + if (toep->inp != NULL) { + toep = hold_toepcb(toep); + toep->flags |= TPF_WAITING_FOR_FINAL; + } else + toep = NULL; } INP_WUNLOCK(inp); ICL_CONN_LOCK(ic); ic->ic_socket = NULL; ICL_CONN_UNLOCK(ic); /* * XXXNP: we should send RST instead of FIN when PDUs held in various * queues were purged instead of delivered reliably but soabort isn't * really general purpose and wouldn't do the right thing here. */ - soref(so); soclose(so); /* * Wait for the socket to fully close. This ensures any * pending received data has been received (and in particular, * any data that would be received by DDP has been handled). * Callers assume that it is safe to free buffers for tasks * and transfers after this function returns. */ - SOCK_LOCK(so); - while ((so->so_state & SS_ISDISCONNECTED) == 0) - mtx_sleep(&so->so_timeo, SOCK_MTX(so), PSOCK, "conclo2", 0); - CURVNET_SET(so->so_vnet); - sorele(so); - CURVNET_RESTORE(); + if (toep != NULL) { + struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); + + mtx_lock(lock); + while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0) + mtx_sleep(toep, lock, PSOCK, "conclo2", 0); + mtx_unlock(lock); + free_toepcb(toep); + } } static void cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp, uint32_t tt) { #ifdef INVARIANTS struct cxgbei_cmp *cmp2; #endif cmp->tt = tt; mtx_lock(&icc->cmp_lock); #ifdef INVARIANTS LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) { KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__)); } #endif LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link); mtx_unlock(&icc->cmp_lock); } struct cxgbei_cmp * cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt) { struct cxgbei_cmp *cmp; mtx_lock(&icc->cmp_lock); LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) { if (cmp->tt == tt) break; } mtx_unlock(&icc->cmp_lock); return (cmp); } static void cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp) { #ifdef INVARIANTS struct cxgbei_cmp *cmp2; #endif mtx_lock(&icc->cmp_lock); #ifdef INVARIANTS LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) { if (cmp2 == cmp) goto found; } panic("%s: could not find cmp", __func__); found: #endif LIST_REMOVE(cmp, link); mtx_unlock(&icc->cmp_lock); } int icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip, struct ccb_scsiio *csio, uint32_t *ittp, void **arg) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct toepcb *toep = icc->toep; struct adapter *sc = icc->sc; struct cxgbei_data *ci = sc->iscsi_ulp_softc; struct ppod_region *pr = &ci->pr; struct cxgbei_ddp_state *ddp; struct ppod_reservation *prsv; struct inpcb *inp; struct mbufq mq; uint32_t itt; int rc = 0; ICL_CONN_LOCK_ASSERT(ic); /* This is for the offload driver's state. Must not be set already. */ MPASS(arg != NULL); MPASS(*arg == NULL); if (ic->ic_disconnecting || ic->ic_socket == NULL) return (ECONNRESET); if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN || csio->dxfer_len < ci->ddp_threshold) { no_ddp: /* * No DDP for this I/O. Allocate an ITT (based on the one * passed in) that cannot be a valid hardware DDP tag in the * iSCSI region. */ itt = *ittp & M_PPOD_TAG; itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit; *ittp = htobe32(itt); MPASS(*arg == NULL); /* State is maintained for DDP only. */ if (rc != 0) counter_u64_add( toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1); return (0); } /* * Reserve resources for DDP, update the itt that should be used in the * PDU, and save DDP specific state for this I/O in *arg. */ ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO); if (ddp == NULL) { rc = ENOMEM; goto no_ddp; } prsv = &ddp->prsv; /* XXX add support for all CAM_DATA_ types */ MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR); rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr, csio->dxfer_len, prsv); if (rc != 0) { free(ddp, M_CXGBEI); goto no_ddp; } mbufq_init(&mq, INT_MAX); rc = t4_write_page_pods_for_buf(sc, toep, prsv, (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq); if (__predict_false(rc != 0)) { mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); goto no_ddp; } /* * Do not get inp from toep->inp as the toepcb might have * detached already. */ inp = sotoinpcb(ic->ic_socket); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) { INP_WUNLOCK(inp); mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); return (ECONNRESET); } mbufq_concat(&toep->ulp_pduq, &mq); INP_WUNLOCK(inp); ddp->cmp.last_datasn = -1; cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag); *ittp = htobe32(prsv->prsv_tag); *arg = prsv; counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1); return (0); } void icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg) { if (arg != NULL) { struct cxgbei_ddp_state *ddp = arg; cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp); t4_free_page_pods(&ddp->prsv); free(ddp, M_CXGBEI); } } static inline bool ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen) { int total_len = 0; MPASS(entries > 0); if (((vm_offset_t)sg[--entries].addr & 3U) != 0) return (false); total_len += sg[entries].len; while (--entries >= 0) { if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 || (sg[entries].len % PAGE_SIZE) != 0) return (false); total_len += sg[entries].len; } MPASS(total_len == xferlen); return (true); } /* XXXNP: PDU should be passed in as parameter, like on the initiator. */ #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr) #define io_to_ddp_state(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr) int icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io, uint32_t *tttp, void **arg) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct toepcb *toep = icc->toep; struct ctl_scsiio *ctsio = &io->scsiio; struct adapter *sc = icc->sc; struct cxgbei_data *ci = sc->iscsi_ulp_softc; struct ppod_region *pr = &ci->pr; struct cxgbei_ddp_state *ddp; struct ppod_reservation *prsv; struct ctl_sg_entry *sgl, sg_entry; struct inpcb *inp; struct mbufq mq; int sg_entries = ctsio->kern_sg_entries; uint32_t ttt; int xferlen, rc = 0, alias; /* This is for the offload driver's state. Must not be set already. */ MPASS(arg != NULL); MPASS(*arg == NULL); if (ctsio->ext_data_filled == 0) { int first_burst; struct icl_pdu *ip = io_to_request_pdu(io); #ifdef INVARIANTS struct icl_cxgbei_pdu *icp = ip_to_icp(ip); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); #endif first_burst = icl_pdu_data_segment_length(ip); /* * Note that ICL calls conn_transfer_setup even if the first * burst had everything and there's nothing left to transfer. * * NB: The CTL frontend might have provided a buffer * whose length (kern_data_len) is smaller than the * FirstBurstLength of unsolicited data. Treat those * as an empty transfer. */ xferlen = ctsio->kern_data_len; if (xferlen < first_burst || xferlen - first_burst < ci->ddp_threshold) { no_ddp: /* * No DDP for this transfer. Allocate a TTT (based on * the one passed in) that cannot be a valid hardware * DDP tag in the iSCSI region. */ ttt = *tttp & M_PPOD_TAG; ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit; *tttp = htobe32(ttt); MPASS(io_to_ddp_state(io) == NULL); if (rc != 0) counter_u64_add( toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1); return (0); } if (sg_entries == 0) { sgl = &sg_entry; sgl->len = xferlen; sgl->addr = (void *)ctsio->kern_data_ptr; sg_entries = 1; } else sgl = (void *)ctsio->kern_data_ptr; if (!ddp_sgl_check(sgl, sg_entries, xferlen)) goto no_ddp; /* * Reserve resources for DDP, update the ttt that should be used * in the PDU, and save DDP specific state for this I/O. */ MPASS(io_to_ddp_state(io) == NULL); ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO); if (ddp == NULL) { rc = ENOMEM; goto no_ddp; } prsv = &ddp->prsv; rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv); if (rc != 0) { free(ddp, M_CXGBEI); goto no_ddp; } mbufq_init(&mq, INT_MAX); rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries, xferlen, &mq); if (__predict_false(rc != 0)) { mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); goto no_ddp; } /* * Do not get inp from toep->inp as the toepcb might * have detached already. */ ICL_CONN_LOCK(ic); if (ic->ic_disconnecting || ic->ic_socket == NULL) { ICL_CONN_UNLOCK(ic); mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); return (ECONNRESET); } inp = sotoinpcb(ic->ic_socket); INP_WLOCK(inp); ICL_CONN_UNLOCK(ic); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) { INP_WUNLOCK(inp); mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); return (ECONNRESET); } mbufq_concat(&toep->ulp_pduq, &mq); INP_WUNLOCK(inp); ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset + first_burst; ddp->cmp.last_datasn = -1; cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag); *tttp = htobe32(prsv->prsv_tag); io_to_ddp_state(io) = ddp; *arg = ctsio; counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1); return (0); } /* * In the middle of an I/O. A non-NULL page pod reservation indicates * that a DDP buffer is being used for the I/O. */ ddp = io_to_ddp_state(ctsio); if (ddp == NULL) goto no_ddp; prsv = &ddp->prsv; alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift; alias++; prsv->prsv_tag &= ~pr->pr_alias_mask; prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask; ddp->cmp.next_datasn = 0; ddp->cmp.last_datasn = -1; cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag); *tttp = htobe32(prsv->prsv_tag); *arg = ctsio; return (0); } void icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg) { struct ctl_scsiio *ctsio = arg; if (ctsio != NULL) { struct cxgbei_ddp_state *ddp; ddp = io_to_ddp_state(ctsio); MPASS(ddp != NULL); cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp); if (ctsio->kern_data_len == ctsio->ext_data_filled || ic->ic_disconnecting) { t4_free_page_pods(&ddp->prsv); free(ddp, M_CXGBEI); io_to_ddp_state(ctsio) = NULL; } } } static void cxgbei_limits(struct adapter *sc, void *arg) { struct icl_drv_limits *idl = arg; struct cxgbei_data *ci; int max_dsl; if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0) return; if (uld_active(sc, ULD_ISCSI)) { ci = sc->iscsi_ulp_softc; MPASS(ci != NULL); /* * AHS is not supported by the kernel so we'll not account for * it either in our PDU len -> data segment len conversions. */ max_dsl = ci->max_rx_pdu_len - ISCSI_BHS_SIZE - ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE; if (idl->idl_max_recv_data_segment_length > max_dsl) idl->idl_max_recv_data_segment_length = max_dsl; max_dsl = ci->max_tx_pdu_len - ISCSI_BHS_SIZE - ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE; if (idl->idl_max_send_data_segment_length > max_dsl) idl->idl_max_send_data_segment_length = max_dsl; } end_synchronized_op(sc, LOCK_HELD); } static int icl_cxgbei_limits(struct icl_drv_limits *idl) { /* Maximum allowed by the RFC. cxgbei_limits will clip them. */ idl->idl_max_recv_data_segment_length = (1 << 24) - 1; idl->idl_max_send_data_segment_length = (1 << 24) - 1; /* These are somewhat arbitrary. */ idl->idl_max_burst_length = max_burst_length; idl->idl_first_burst_length = first_burst_length; t4_iterate(cxgbei_limits, idl); return (0); } int icl_cxgbei_mod_load(void) { int rc; refcount_init(&icl_cxgbei_ncons, 0); rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits, icl_cxgbei_new_conn); return (rc); } int icl_cxgbei_mod_unload(void) { if (icl_cxgbei_ncons != 0) return (EBUSY); icl_unregister("cxgbei", false); return (0); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index f3cd8270d24d..fdd0ab870f43 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1,1951 +1,1961 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw *tcp_protosw; static struct protosw toe_protosw; static struct pr_usrreqs toe_usrreqs; static struct protosw *tcp6_protosw; static struct protosw toe6_protosw; static struct pr_usrreqs toe6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. */ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->state != CS_HW_CONFIGURED) { CH_ERR(vi, "tid %d cannot be bound to traffic class %d " "because it is not configured (its state is %d)\n", toep->tid, cp->tc_idx, tc->state); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; tls_init_toep(toep); if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_init_toep(toep); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } void restore_so_proto(struct socket *so, bool v6) { if (v6) so->so_proto = tcp6_protosw; else so->so_proto = tcp_protosw; } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; restore_so_proto(so, inp->inp_vflag & INP_IPV6); SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_len(&toep->ulp_pduq) == 0); MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) t4_release_clip_entry(sc, toep->ce); if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif if (ulp_mode(toep) == ULP_MODE_TLS) tls_detach(toep); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0); break; default: break; } } static inline uint64_t get_tcb_tflags(const uint64_t *tcb) { return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } static inline uint32_t get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { #define LAST_WORD ((TCB_SIZE / 4) - 1) uint64_t t1, t2; int flit_idx; MPASS(mask != 0); MPASS(word <= LAST_WORD); MPASS(shift < 32); flit_idx = (LAST_WORD - word) / 2; if (word & 0x1) shift += 32; t1 = be64toh(tcb[flit_idx]) >> shift; t2 = 0; if (fls(mask) > 64 - shift) { /* * Will spill over into the next logical flit, which is the flit * before this one. The flit_idx before this one must be valid. */ MPASS(flit_idx > 0); t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); } return ((t2 | t1) & mask); #undef LAST_WORD } #define GET_TCB_FIELD(tcb, F) \ get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) /* * Issues a CPL_GET_TCB to read the entire TCB for the tid. */ static int send_get_tcb(struct adapter *sc, u_int tid) { struct cpl_get_tcb *cpl; struct wrq_cookie cookie; MPASS(tid < sc->tids.ntids); cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), &cookie); if (__predict_false(cpl == NULL)) return (ENOMEM); bzero(cpl, sizeof(*cpl)); INIT_TP_WR(cpl, tid); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); cpl->cookie = 0xff; commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); return (0); } static struct tcb_histent * alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) { struct tcb_histent *te; MPASS(flags == M_NOWAIT || flags == M_WAITOK); te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); if (te == NULL) return (NULL); mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); callout_init_mtx(&te->te_callout, &te->te_lock, 0); te->te_adapter = sc; te->te_tid = tid; return (te); } static void free_tcb_histent(struct tcb_histent *te) { mtx_destroy(&te->te_lock); free(te, M_CXGBE); } /* * Start tracking the tid in the TCB history. */ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. */ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. */ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. */ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. */ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. */ static void t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; + bool need_wakeup; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); else if (ulp_mode(toep) == ULP_MODE_TLS) tls_detach(toep); toep->inp = NULL; - toep->flags &= ~TPF_CPL_PENDING; + need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0; + toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL); mbufq_drain(&toep->ulp_pduq); mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); + + if (need_wakeup) { + struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); + + mtx_lock(lock); + wakeup(toep); + mtx_unlock(lock); + } } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); #ifdef USE_DDP_RX_FLOW_CONTROL if (cp->ulp_mode == ULP_MODE_TCPDDP) opt2 |= F_RX_FC_DDP; #endif return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } static int is_tls_sock(struct socket *so, struct adapter *sc) { struct inpcb *inp = sotoinpcb(so); int i, rc; /* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */ rc = 0; ADAPTER_LOCK(sc); for (i = 0; i < sc->tt.num_tls_rx_ports; i++) { if (inp->inp_lport == htons(sc->tt.tls_rx_ports[i]) || inp->inp_fport == htons(sc->tt.tls_rx_ports[i])) { rc = 1; break; } } ADAPTER_UNLOCK(sc); return (rc); } /* * Initialize various connection parameters. */ void init_conn_params(struct vi_info *vi , struct offload_settings *s, struct in_conninfo *inc, struct socket *so, const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tom_tunables *tt = &sc->tt; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); u_long wnd; MPASS(s->offload != 0); /* Congestion control algorithm */ if (s->cong_algo >= 0) cp->cong_algo = s->cong_algo & M_CONG_CNTRL; else if (sc->tt.cong_algorithm >= 0) cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL; else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) cp->cong_algo = CONG_ALG_RENO; else if (strcasecmp(cc->name, "tahoe") == 0) cp->cong_algo = CONG_ALG_TAHOE; if (strcasecmp(cc->name, "newreno") == 0) cp->cong_algo = CONG_ALG_NEWRENO; if (strcasecmp(cc->name, "highspeed") == 0) cp->cong_algo = CONG_ALG_HIGHSPEED; else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ cp->cong_algo = CONG_ALG_NEWRENO; } } /* Tx traffic scheduling class. */ if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls) cp->tc_idx = s->sched_class; else cp->tc_idx = -1; /* Nagle's algorithm. */ if (s->nagle >= 0) cp->nagle = s->nagle > 0 ? 1 : 0; else cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1; /* TCP Keepalive. */ if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE) cp->keepalive = 1; else cp->keepalive = 0; /* Optimization that's specific to T5 @ 40G. */ if (tt->tx_align >= 0) cp->tx_align = tt->tx_align > 0 ? 1 : 0; else if (chip_id(sc) == CHELSIO_T5 && (port_top_speed(pi) > 10 || sc->params.nports > 2)) cp->tx_align = 1; else cp->tx_align = 0; /* ULP mode. */ if (can_tls_offload(sc) && (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc)))) cp->ulp_mode = ULP_MODE_TLS; else if (s->ddp > 0 || (s->ddp < 0 && sc->tt.ddp && (so_options_get(so) & SO_NO_DDP) == 0)) cp->ulp_mode = ULP_MODE_TCPDDP; else cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (cp->ulp_mode == ULP_MODE_TLS) cp->rx_coalesce = 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. */ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Tx queue for this connection. */ if (s->txq >= 0 && s->txq < vi->nofldtxq) cp->txq_idx = s->txq; else cp->txq_idx = arc4random() % vi->nofldtxq; cp->txq_idx += vi->first_ofld_txq; /* Rx queue for this connection. */ if (s->rxq >= 0 && s->rxq < vi->nofldrxq) cp->rxq_idx = s->rxq; else cp->rxq_idx = arc4random() % vi->nofldrxq; cp->rxq_idx += vi->first_ofld_rxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. */ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. */ cp->emss = 0; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tab(struct tid_info *t, int flags) { MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, M_ZERO | flags); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tab(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } static int alloc_stid_tab(struct tid_info *t, int flags) { MPASS(t->nstids > 0); MPASS(t->stid_tab == NULL); t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, M_ZERO | flags); if (t->stid_tab == NULL) return (ENOMEM); mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; return (0); } static void free_stid_tab(struct tid_info *t) { KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); free(t->stid_tab, M_CXGBE); t->stid_tab = NULL; } static void free_tid_tabs(struct tid_info *t) { free_tid_tab(t); free_stid_tab(t); } static int alloc_tid_tabs(struct tid_info *t) { int rc; rc = alloc_tid_tab(t, M_NOWAIT); if (rc != 0) goto failed; rc = alloc_stid_tab(t, M_NOWAIT); if (rc != 0) goto failed; return (0); failed: free_tid_tabs(t); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct tom_data *td) { #ifdef INVARIANTS int i; if (td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tcb_history(sc, td); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static char * prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, int *buflen) { char *pkt; struct tcphdr *th; int ipv6, len; const int maxlen = max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + sizeof(struct tcphdr); MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); if (pkt == NULL) return (NULL); ipv6 = inp->inp_vflag & INP_IPV6; len = 0; if (EVL_VLANOFTAG(vtag) == 0xfff) { struct ether_header *eh = (void *)pkt; if (ipv6) eh->ether_type = htons(ETHERTYPE_IPV6); else eh->ether_type = htons(ETHERTYPE_IP); len += sizeof(*eh); } else { struct ether_vlan_header *evh = (void *)pkt; evh->evl_encap_proto = htons(ETHERTYPE_VLAN); evh->evl_tag = htons(vtag); if (ipv6) evh->evl_proto = htons(ETHERTYPE_IPV6); else evh->evl_proto = htons(ETHERTYPE_IP); len += sizeof(*evh); } if (ipv6) { struct ip6_hdr *ip6 = (void *)&pkt[len]; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = ip6->ip6_src; } len += sizeof(*ip6); } else { struct ip *ip = (void *)&pkt[len]; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); ip->ip_ttl = inp->inp_ip_ttl; ip->ip_p = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip->ip_src = inp->inp_laddr; ip->ip_dst = ip->ip_src; } len += sizeof(*ip); } th = (void *)&pkt[len]; if (open_type == OPEN_TYPE_ACTIVE) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = inp->inp_fport; /* ditto */ } else if (open_type == OPEN_TYPE_LISTEN) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = th->th_sport; } len += sizeof(th); *pktlen = *buflen = len; return (pkt); } const struct offload_settings * lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, uint16_t vtag, struct inpcb *inp) { const struct t4_offload_policy *op; char *pkt; struct offload_rule *r; int i, matched, pktlen, buflen; static const struct offload_settings allow_offloading_settings = { .offload = 1, .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1, .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1, .ddp = -1, .tls = -1, .txq = -1, .rxq = -1, .mss = -1, }; static const struct offload_settings disallow_offloading_settings = { .offload = 0, /* rest is irrelevant when offload is off. */ }; rw_assert(&sc->policy_lock, RA_LOCKED); /* * If there's no Connection Offloading Policy attached to the device * then we need to return a default static policy. If * "cop_managed_offloading" is true, then we need to disallow * offloading until a COP is attached to the device. Otherwise we * allow offloading ... */ op = sc->policy; if (op == NULL) { if (sc->tt.cop_managed_offloading) return (&disallow_offloading_settings); else return (&allow_offloading_settings); } switch (open_type) { case OPEN_TYPE_ACTIVE: case OPEN_TYPE_LISTEN: pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen); break; case OPEN_TYPE_PASSIVE: MPASS(m != NULL); pkt = mtod(m, char *); MPASS(*pkt == CPL_PASS_ACCEPT_REQ); pkt += sizeof(struct cpl_pass_accept_req); pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); buflen = m->m_len - sizeof(struct cpl_pass_accept_req); break; default: MPASS(0); return (&disallow_offloading_settings); } if (pkt == NULL || pktlen == 0 || buflen == 0) return (&disallow_offloading_settings); matched = 0; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { if (r->open_type != open_type && r->open_type != OPEN_TYPE_DONTCARE) { continue; } matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); if (matched) break; } if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) free(pkt, M_CXGBE); return (matched ? &r->settings : &disallow_offloading_settings); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid, tid; struct wrqe *wr; struct adapter *sc = td_adapter(td); mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; case CPL_PASS_ACCEPT_RPL: tid = GET_TID(cpl); CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid); synack_failure_cleanup(sc, tid); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; int i, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); alloc_tcb_history(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; tod->tod_tcp_info = t4_tcp_info; #ifdef KERN_TLS tod->tod_alloc_tls_session = t4_alloc_tls_session; #endif for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { TOEDEV(vi->ifp) = &td->tod; } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; if (ulp_mode(toep) == ULP_MODE_TCPDDP) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) { if (sopt->sopt_level != IPPROTO_TCP) return (tcp_ctloutput(so, sopt)); switch (sopt->sopt_name) { case TCP_TLSOM_SET_TLS_CONTEXT: case TCP_TLSOM_GET_TLS_TOM: case TCP_TLSOM_CLR_TLS_TOM: case TCP_TLSOM_CLR_QUIES: return (t4_ctloutput_tls(so, sopt)); default: return (tcp_ctloutput(so, sopt)); } } static int t4_tom_mod_load(void) { /* CPL handlers */ t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); t4_ddp_mod_load(); t4_tls_mod_load(); tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw)); bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs)); toe_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_usrreqs = &toe_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs)); toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_usrreqs = &toe6_usrreqs; return (t4_register_uld(&tom_uld_info)); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); t4_tls_mod_unload(); t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 89c38f8c988a..2224ece20c38 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -1,535 +1,536 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_TOM_H__ #define __T4_TOM_H__ #include #include "common/t4_hw.h" #include "common/t4_msg.h" #include "tom/t4_tls.h" #define LISTEN_HASH_SIZE 32 /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. */ #define MAX_RCV_WND ((1U << 27) - 1) #define DDP_RSVD_WIN (16 * 1024U) #define SB_DDP_INDICATE SB_IN_TOE /* soreceive must respond to indicate */ #define USE_DDP_RX_FLOW_CONTROL #define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) #define PPOD_SIZE (PPOD_SZ(1)) /* TOE PCB flags */ enum { TPF_ATTACHED = (1 << 0), /* a tcpcb refers to this toepcb */ TPF_FLOWC_WR_SENT = (1 << 1), /* firmware flow context WR sent */ TPF_TX_DATA_SENT = (1 << 2), /* some data sent */ TPF_TX_SUSPENDED = (1 << 3), /* tx suspended for lack of resources */ TPF_SEND_FIN = (1 << 4), /* send FIN after all pending data */ TPF_FIN_SENT = (1 << 5), /* FIN has been sent */ TPF_ABORT_SHUTDOWN = (1 << 6), /* connection abort is in progress */ TPF_CPL_PENDING = (1 << 7), /* haven't received the last CPL */ TPF_SYNQE = (1 << 8), /* synq_entry, not really a toepcb */ TPF_SYNQE_EXPANDED = (1 << 9), /* toepcb ready, tid context updated */ TPF_FORCE_CREDITS = (1 << 10), /* always send credits */ TPF_KTLS = (1 << 11), /* send TLS records from KTLS */ TPF_INITIALIZED = (1 << 12), /* init_toepcb has been called */ TPF_TLS_RECEIVE = (1 << 13), /* should receive TLS records */ TPF_TLS_ESTABLISHED = (1 << 14), /* TLS handshake timer initialized */ + TPF_WAITING_FOR_FINAL = (1<< 15), /* waiting for wakeup on final CPL */ }; enum { DDP_OK = (1 << 0), /* OK to turn on DDP */ DDP_SC_REQ = (1 << 1), /* state change (on/off) requested */ DDP_ON = (1 << 2), /* DDP is turned on */ DDP_BUF0_ACTIVE = (1 << 3), /* buffer 0 in use (not invalidated) */ DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */ DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */ DDP_DEAD = (1 << 6), /* toepcb is shutting down */ }; struct ctl_sg_entry; struct sockopt; struct offload_settings; /* * Connection parameters for an offloaded connection. These are mostly (but not * all) hardware TOE parameters. */ struct conn_params { int8_t rx_coalesce; int8_t cong_algo; int8_t tc_idx; int8_t tstamp; int8_t sack; int8_t nagle; int8_t keepalive; int8_t wscale; int8_t ecn; int8_t mtu_idx; int8_t ulp_mode; int8_t tx_align; int16_t txq_idx; /* ofld_txq = &sc->sge.ofld_txq[txq_idx] */ int16_t rxq_idx; /* ofld_rxq = &sc->sge.ofld_rxq[rxq_idx] */ int16_t l2t_idx; uint16_t emss; uint16_t opt0_bufsize; u_int sndbuf; /* controls TP tx pages */ }; struct ofld_tx_sdesc { uint32_t plen; /* payload length */ uint8_t tx_credits; /* firmware tx credits (unit is 16B) */ void *iv_buffer; /* optional buffer holding IVs for TLS */ }; struct ppod_region { u_int pr_start; u_int pr_len; u_int pr_page_shift[4]; uint32_t pr_tag_mask; /* hardware tagmask for this region. */ uint32_t pr_invalid_bit; /* OR with this to invalidate tag. */ uint32_t pr_alias_mask; /* AND with tag to get alias bits. */ u_int pr_alias_shift; /* shift this much for first alias bit. */ vmem_t *pr_arena; }; struct ppod_reservation { struct ppod_region *prsv_pr; uint32_t prsv_tag; /* Full tag: pgsz, alias, tag, color */ u_int prsv_nppods; }; struct pageset { TAILQ_ENTRY(pageset) link; vm_page_t *pages; int npages; int flags; int offset; /* offset in first page */ int len; struct ppod_reservation prsv; struct vmspace *vm; vm_offset_t start; u_int vm_timestamp; }; TAILQ_HEAD(pagesetq, pageset); #define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */ struct ddp_buffer { struct pageset *ps; struct kaiocb *job; int cancel_pending; }; struct ddp_pcb { u_int flags; struct ddp_buffer db[2]; TAILQ_HEAD(, pageset) cached_pagesets; TAILQ_HEAD(, kaiocb) aiojobq; u_int waiting_count; u_int active_count; u_int cached_count; int active_id; /* the currently active DDP buffer */ struct task requeue_task; struct kaiocb *queueing; struct mtx lock; }; struct toepcb { struct tom_data *td; struct inpcb *inp; /* backpointer to host stack's PCB */ u_int flags; /* miscellaneous flags */ TAILQ_ENTRY(toepcb) link; /* toep_list */ int refcount; struct vnet *vnet; struct vi_info *vi; /* virtual interface */ struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ctrlq; struct l2t_entry *l2te; /* L2 table entry used by this connection */ struct clip_entry *ce; /* CLIP table entry used by this tid */ int tid; /* Connection identifier */ /* tx credit handling */ u_int tx_total; /* total tx WR credits (in 16B units) */ u_int tx_credits; /* tx WR credits (in 16B units) available */ u_int tx_nocompl; /* tx WR credits since last compl request */ u_int plen_nocompl; /* payload since last compl request */ struct conn_params params; void *ulpcb; void *ulpcb2; struct mbufq ulp_pduq; /* PDUs waiting to be sent out. */ struct mbufq ulp_pdu_reclaimq; struct ddp_pcb ddp; struct tls_ofld_info tls; TAILQ_HEAD(, kaiocb) aiotx_jobq; struct task aiotx_task; struct socket *aiotx_so; /* Tx software descriptor */ uint8_t txsd_total; uint8_t txsd_pidx; uint8_t txsd_cidx; uint8_t txsd_avail; struct ofld_tx_sdesc txsd[]; }; static inline int ulp_mode(struct toepcb *toep) { return (toep->params.ulp_mode); } #define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock) #define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock) #define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED) /* * Compressed state for embryonic connections for a listener. */ struct synq_entry { struct listen_ctx *lctx; /* backpointer to listen ctx */ struct mbuf *syn; int flags; /* same as toepcb's tp_flags */ volatile int ok_to_respond; volatile u_int refcnt; int tid; uint32_t iss; uint32_t irs; uint32_t ts; uint32_t rss_hash; __be16 tcp_opt; /* from cpl_pass_establish */ struct toepcb *toep; struct conn_params params; }; /* listen_ctx flags */ #define LCTX_RPL_PENDING 1 /* waiting for a CPL_PASS_OPEN_RPL */ struct listen_ctx { LIST_ENTRY(listen_ctx) link; /* listen hash linkage */ volatile int refcount; int stid; struct stid_region stid_region; int flags; struct inpcb *inp; /* listening socket's inp */ struct vnet *vnet; struct sge_wrq *ctrlq; struct sge_ofld_rxq *ofld_rxq; struct clip_entry *ce; }; /* tcb_histent flags */ #define TE_RPL_PENDING 1 #define TE_ACTIVE 2 /* bits in one 8b tcb_histent sample. */ #define TS_RTO (1 << 0) #define TS_DUPACKS (1 << 1) #define TS_FASTREXMT (1 << 2) #define TS_SND_BACKLOGGED (1 << 3) #define TS_CWND_LIMITED (1 << 4) #define TS_ECN_ECE (1 << 5) #define TS_ECN_CWR (1 << 6) #define TS_RESERVED (1 << 7) /* Unused. */ struct tcb_histent { struct mtx te_lock; struct callout te_callout; uint64_t te_tcb[TCB_SIZE / sizeof(uint64_t)]; struct adapter *te_adapter; u_int te_flags; u_int te_tid; uint8_t te_pidx; uint8_t te_sample[100]; }; struct tom_data { struct toedev tod; /* toepcb's associated with this TOE device */ struct mtx toep_list_lock; TAILQ_HEAD(, toepcb) toep_list; struct mtx lctx_hash_lock; LIST_HEAD(, listen_ctx) *listen_hash; u_long listen_mask; int lctx_count; /* # of lctx in the hash table */ struct ppod_region pr; struct rwlock tcb_history_lock __aligned(CACHE_LINE_SIZE); struct tcb_histent **tcb_history; int dupack_threshold; /* WRs that will not be sent to the chip because L2 resolution failed */ struct mtx unsent_wr_lock; STAILQ_HEAD(, wrqe) unsent_wr_list; struct task reclaim_wr_resources; }; static inline struct tom_data * tod_td(struct toedev *tod) { return (__containerof(tod, struct tom_data, tod)); } static inline struct adapter * td_adapter(struct tom_data *td) { return (td->tod.tod_softc); } static inline void set_mbuf_raw_wr(struct mbuf *m, bool raw) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[6] = raw; } static inline bool mbuf_raw_wr(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[6]); } static inline void set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[0] = ulp_submode; } static inline uint8_t mbuf_ulp_submode(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[0]); } static inline void set_mbuf_iscsi_iso(struct mbuf *m, bool iso) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[1] = iso; } static inline bool mbuf_iscsi_iso(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[1]); } /* Flags for iSCSI segmentation offload. */ #define CXGBE_ISO_TYPE(flags) ((flags) & 0x3) #define CXGBE_ISO_F 0x4 static inline void set_mbuf_iscsi_iso_flags(struct mbuf *m, uint8_t flags) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[2] = flags; } static inline uint8_t mbuf_iscsi_iso_flags(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[2]); } static inline void set_mbuf_iscsi_iso_mss(struct mbuf *m, uint16_t mss) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.sixteen[2] = mss; } static inline uint16_t mbuf_iscsi_iso_mss(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.sixteen[2]); } /* t4_tom.c */ struct toepcb *alloc_toepcb(struct vi_info *, int); int init_toepcb(struct vi_info *, struct toepcb *); struct toepcb *hold_toepcb(struct toepcb *); void free_toepcb(struct toepcb *); void offload_socket(struct socket *, struct toepcb *); void restore_so_proto(struct socket *, bool); void undo_offload_socket(struct socket *); void final_cpl_received(struct toepcb *); void insert_tid(struct adapter *, int, void *, int); void *lookup_tid(struct adapter *, int); void update_tid(struct adapter *, int, void *); void remove_tid(struct adapter *, int, int); u_long select_rcv_wnd(struct socket *); int select_rcv_wscale(void); void init_conn_params(struct vi_info *, struct offload_settings *, struct in_conninfo *, struct socket *, const struct tcp_options *, int16_t, struct conn_params *cp); __be64 calc_options0(struct vi_info *, struct conn_params *); __be32 calc_options2(struct vi_info *, struct conn_params *); uint64_t select_ntuple(struct vi_info *, struct l2t_entry *); int negative_advice(int); int add_tid_to_history(struct adapter *, u_int); /* t4_connect.c */ void t4_init_connect_cpl_handlers(void); void t4_uninit_connect_cpl_handlers(void); int t4_connect(struct toedev *, struct socket *, struct nhop_object *, struct sockaddr *); void act_open_failure_cleanup(struct adapter *, u_int, u_int); /* t4_listen.c */ void t4_init_listen_cpl_handlers(void); void t4_uninit_listen_cpl_handlers(void); int t4_listen_start(struct toedev *, struct tcpcb *); int t4_listen_stop(struct toedev *, struct tcpcb *); void t4_syncache_added(struct toedev *, void *); void t4_syncache_removed(struct toedev *, void *); int t4_syncache_respond(struct toedev *, void *, struct mbuf *); int do_abort_req_synqe(struct sge_iq *, const struct rss_header *, struct mbuf *); int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *, struct mbuf *); void t4_offload_socket(struct toedev *, void *, struct socket *); void synack_failure_cleanup(struct adapter *, int); /* t4_cpl_io.c */ void aiotx_init_toep(struct toepcb *); int t4_aio_queue_aiotx(struct socket *, struct kaiocb *); void t4_init_cpl_io_handlers(void); void t4_uninit_cpl_io_handlers(void); void send_abort_rpl(struct adapter *, struct sge_ofld_txq *, int , int); void send_flowc_wr(struct toepcb *, struct tcpcb *); void send_reset(struct adapter *, struct toepcb *, uint32_t); int send_rx_credits(struct adapter *, struct toepcb *, int); void send_rx_modulate(struct adapter *, struct toepcb *); void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t); int t4_close_conn(struct adapter *, struct toepcb *); void t4_rcvd(struct toedev *, struct tcpcb *); void t4_rcvd_locked(struct toedev *, struct tcpcb *); int t4_tod_output(struct toedev *, struct tcpcb *); int t4_send_fin(struct toedev *, struct tcpcb *); int t4_send_rst(struct toedev *, struct tcpcb *); void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *, uint16_t, uint64_t, uint64_t, int, int); void t4_push_frames(struct adapter *, struct toepcb *, int); void t4_push_pdus(struct adapter *, struct toepcb *, int); /* t4_ddp.c */ int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int, const char *); void t4_free_ppod_region(struct ppod_region *); int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *); int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int, struct ppod_reservation *); int t4_alloc_page_pods_for_sgl(struct ppod_region *, struct ctl_sg_entry *, int, struct ppod_reservation *); int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int, struct pageset *); int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *, struct ppod_reservation *, vm_offset_t, int, struct mbufq *); int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *, struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *); void t4_free_page_pods(struct ppod_reservation *); int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *, struct mbuf **, struct mbuf **, int *); int t4_aio_queue_ddp(struct socket *, struct kaiocb *); void t4_ddp_mod_load(void); void t4_ddp_mod_unload(void); void ddp_assert_empty(struct toepcb *); void ddp_init_toep(struct toepcb *); void ddp_uninit_toep(struct toepcb *); void ddp_queue_toep(struct toepcb *); void release_ddp_resources(struct toepcb *toep); void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t); void handle_ddp_indicate(struct toepcb *); void insert_ddp_data(struct toepcb *, uint32_t); const struct offload_settings *lookup_offload_policy(struct adapter *, int, struct mbuf *, uint16_t, struct inpcb *); /* t4_tls.c */ bool can_tls_offload(struct adapter *); void do_rx_data_tls(const struct cpl_rx_data *, struct toepcb *, struct mbuf *); int t4_ctloutput_tls(struct socket *, struct sockopt *); void t4_push_tls_records(struct adapter *, struct toepcb *, int); void t4_push_ktls(struct adapter *, struct toepcb *, int); void t4_tls_mod_load(void); void t4_tls_mod_unload(void); void tls_detach(struct toepcb *); void tls_establish(struct toepcb *); void tls_init_toep(struct toepcb *); int tls_rx_key(struct toepcb *); void tls_stop_handshake_timer(struct toepcb *); int tls_tx_key(struct toepcb *); void tls_uninit_toep(struct toepcb *); int tls_alloc_ktls(struct toepcb *, struct ktls_session *, int); #endif