diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index b17056dc0338..c13901ee1a59 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -1,250 +1,251 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2010 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef __T4_OFFLOAD_H__ #define __T4_OFFLOAD_H__ #include #include #include #define INIT_ULPTX_WRH(w, wrlen, atomic, tid) do { \ (w)->wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \ (w)->wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ V_FW_WR_FLOWID(tid)); \ (w)->wr_lo = cpu_to_be64(0); \ } while (0) #define INIT_ULPTX_WR(w, wrlen, atomic, tid) \ INIT_ULPTX_WRH(&((w)->wr), wrlen, atomic, tid) #define INIT_TP_WR(w, tid) do { \ (w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | \ V_FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \ (w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(sizeof(*w), 16)) | \ V_FW_WR_FLOWID(tid)); \ (w)->wr.wr_lo = cpu_to_be64(0); \ } while (0) #define INIT_TP_WR_MIT_CPL(w, cpl, tid) do { \ INIT_TP_WR(w, tid); \ OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \ } while (0) TAILQ_HEAD(stid_head, stid_region); struct listen_ctx; struct stid_region { TAILQ_ENTRY(stid_region) link; u_int used; /* # of stids used by this region */ u_int free; /* # of contiguous stids free right after this region */ }; /* * Max # of ATIDs. The absolute HW max is larger than this but we reserve a few * of the upper bits for use as a cookie to demux the reply. */ #define MAX_ATIDS (M_TID_TID + 1) union aopen_entry { void *data; union aopen_entry *next; }; /* cxgbe_rate_tag flags */ enum { EO_FLOWC_PENDING = (1 << 0), /* flowc needs to be sent */ EO_FLOWC_RPL_PENDING = (1 << 1), /* flowc credits due back */ EO_SND_TAG_REF = (1 << 2), /* kernel has a ref on us */ EO_FLUSH_RPL_PENDING = (1 << 3), /* credit flush rpl due back */ }; struct cxgbe_rate_tag { struct m_snd_tag com; struct adapter *adapter; u_int flags; struct mtx lock; int port_id; int etid; struct mbufq pending_tx, pending_fwack; int plen; struct sge_ofld_txq *eo_txq; uint32_t ctrl0; uint16_t iqid; int8_t schedcl; uint64_t max_rate; /* in bytes/s */ uint8_t tx_total; /* total tx WR credits (in 16B units) */ uint8_t tx_credits; /* tx WR credits (in 16B units) available */ uint8_t tx_nocompl; /* tx WR credits since last compl request */ uint8_t ncompl; /* # of completions outstanding. */ }; static inline struct cxgbe_rate_tag * mst_to_crt(struct m_snd_tag *t) { return (__containerof(t, struct cxgbe_rate_tag, com)); } union etid_entry { struct cxgbe_rate_tag *cst; union etid_entry *next; }; /* * Holds the size, base address, start, end, etc. of various types of TIDs. The * tables themselves are allocated dynamically. */ struct tid_info { u_int nstids; u_int stid_base; u_int natids; u_int nftids; u_int ftid_base; u_int ftid_end; u_int nhpftids; u_int hpftid_base; u_int hpftid_end; u_int ntids; u_int tid_base; u_int netids; u_int etid_base; u_int etid_end; struct mtx stid_lock __aligned(CACHE_LINE_SIZE); struct listen_ctx **stid_tab; u_int stids_in_use; u_int nstids_free_head; /* # of available stids at the beginning */ struct stid_head stids; + bool stid_tab_stopped; struct mtx atid_lock __aligned(CACHE_LINE_SIZE); union aopen_entry *atid_tab; union aopen_entry *afree; u_int atids_in_use; bool atid_alloc_stopped; /* High priority filters and normal filters share the lock and cv. */ struct mtx ftid_lock __aligned(CACHE_LINE_SIZE); struct cv ftid_cv; struct filter_entry *ftid_tab; struct filter_entry *hpftid_tab; u_int ftids_in_use; u_int hpftids_in_use; /* * hashfilter and TOE are mutually exclusive and both use ntids and * tids_in_use. The lock and cv are used only by hashfilter. */ struct mtx hftid_lock __aligned(CACHE_LINE_SIZE); struct cv hftid_cv; void **tid_tab; u_int tids_in_use; void *hftid_hash_4t; /* LIST_HEAD(, filter_entry) *hftid_hash_4t; */ u_long hftid_4t_mask; void *hftid_hash_tid; /* LIST_HEAD(, filter_entry) *hftid_hash_tid; */ u_long hftid_tid_mask; struct mtx etid_lock __aligned(CACHE_LINE_SIZE); union etid_entry *etid_tab; union etid_entry *efree; u_int etids_in_use; }; struct t4_range { u_int start; u_int size; }; struct t4_virt_res { /* virtualized HW resources */ struct t4_range ddp; struct t4_range iscsi; struct t4_range stag; struct t4_range rq; struct t4_range pbl; struct t4_range qp; struct t4_range cq; struct t4_range srq; struct t4_range ocq; struct t4_range l2t; struct t4_range key; }; enum { ULD_TOM = 0, ULD_IWARP, ULD_ISCSI, ULD_MAX = ULD_ISCSI }; struct adapter; struct port_info; struct uld_info { int (*uld_activate)(struct adapter *); int (*uld_deactivate)(struct adapter *); int (*uld_stop)(struct adapter *); int (*uld_restart)(struct adapter *); }; struct tom_tunables { int cong_algorithm; int sndbuf; int ddp; int rx_coalesce; int tls; int tx_align; int tx_zcopy; int cop_managed_offloading; int autorcvbuf_inc; int update_hc_on_pmtu_change; int iso; }; /* iWARP driver tunables */ struct iw_tunables { int wc_en; }; struct tls_tunables { int inline_keys; int combo_wrs; }; #ifdef TCP_OFFLOAD int t4_register_uld(struct uld_info *, int); int t4_unregister_uld(struct uld_info *, int); int t4_activate_uld(struct adapter *, int); int t4_deactivate_uld(struct adapter *, int); int uld_active(struct adapter *, int); #endif #endif diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index 4e81f23dc267..1c98e70a4df5 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -1,412 +1,413 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * Active open succeeded. */ static int do_act_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_establish *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); u_int atid = G_TID_TID(ntohl(cpl->tos_atid)); struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(sc, atid); CURVNET_SET(toep->vnet); INP_WLOCK(inp); toep->tid = tid; insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_flowc_wr(toep, NULL); send_reset(sc, toep, be32toh(cpl->snd_isn)); goto done; } make_established(toep, be32toh(cpl->snd_isn) - 1, be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt); inp->inp_flowtype = M_HASHTYPE_OPAQUE; inp->inp_flowid = tid; done: INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void -act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) +act_open_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status) { - struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; struct toedev *tod = &toep->td->tod; struct epoch_tracker et; - free_atid(sc, atid); - toep->tid = -1; + if (toep->tid >= 0) { + free_atid(sc, toep->tid); + toep->tid = -1; + } CURVNET_SET(toep->vnet); if (status != EAGAIN) NET_EPOCH_ENTER(et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Active open failed. */ static int do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct toepcb *toep = lookup_atid(sc, atid); int rc; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); /* Ignore negative advice */ if (negative_advice(status)) return (0); if (status && act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), toep->ctrlq); rc = act_open_rpl_status_to_errno(status); - act_open_failure_cleanup(sc, atid, rc); + act_open_failure_cleanup(sc, toep, rc); return (0); } void t4_init_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl, CPL_COOKIE_TOM); } void t4_uninit_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM); } #ifdef KTR #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ reason = __LINE__; \ rc = (x); \ goto failed; \ } while (0) #else #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ rc = (x); \ goto failed; \ } while (0) #endif static inline int act_open_cpl_size(struct adapter *sc, int isipv6) { int idx; static const int sz_table[3][2] = { { sizeof (struct cpl_act_open_req), sizeof (struct cpl_act_open_req6) }, { sizeof (struct cpl_t5_act_open_req), sizeof (struct cpl_t5_act_open_req6) }, { sizeof (struct cpl_t6_act_open_req), sizeof (struct cpl_t6_act_open_req6) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 2); return (sz_table[idx][!!isipv6]); } /* * active open (soconnect). * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = NULL; struct wrqe *wr = NULL; if_t rt_ifp = nh->nh_ifp; struct vi_info *vi; int qid_atid, rc, isipv6; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); #ifdef KTR int reason; #endif struct offload_settings settings; struct epoch_tracker et; uint16_t vid = 0xfff, pcp = 0; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); if (if_gettype(rt_ifp) == IFT_ETHER) vi = if_getsoftc(rt_ifp); else if (if_gettype(rt_ifp) == IFT_L2VLAN) { if_t ifp = VLAN_TRUNKDEV(rt_ifp); vi = if_getsoftc(ifp); VLAN_TAG(rt_ifp, &vid); VLAN_PCP(rt_ifp, &pcp); } else if (if_gettype(rt_ifp) == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); if (sc->flags & KERN_TLS_ON) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, EVL_MAKETAG(vid, pcp, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) DONT_OFFLOAD_ACTIVE_OPEN(EPERM); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; init_conn_params(vi, &settings, &inp->inp_inc, so, NULL, toep->l2te->idx, &toep->params); init_toepcb(vi, toep); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM); if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; if ((inp->inp_vflag & INP_IPV6) == 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); toep->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } offload_socket(so, toep); NET_EPOCH_ENTER(et); rc = t4_l2t_send(sc, wr, toep->l2te); NET_EPOCH_EXIT(et); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); #if defined(KTR) reason = __LINE__; #endif failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) t4_release_clip_entry(sc, toep->ce); free_toepcb(toep); } return (rc); } #endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 6e8361734db7..897c5bcaab1e 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -1,1614 +1,1722 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* stid services */ static int alloc_stid(struct adapter *, struct listen_ctx *, int); static struct listen_ctx *lookup_stid(struct adapter *, int); static void free_stid(struct adapter *, struct listen_ctx *); /* lctx services */ static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, struct vi_info *); static int free_lctx(struct adapter *, struct listen_ctx *); static void hold_lctx(struct listen_ctx *); static void listen_hash_add(struct adapter *, struct listen_ctx *); static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); +static int create_server6(struct adapter *, struct listen_ctx *); +static int create_server(struct adapter *, struct listen_ctx *); + +int +alloc_stid_tab(struct adapter *sc) +{ + struct tid_info *t = &sc->tids; + + MPASS(t->nstids > 0); + MPASS(t->stid_tab == NULL); + + t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, + M_ZERO | M_NOWAIT); + if (t->stid_tab == NULL) + return (ENOMEM); + mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); + t->stids_in_use = 0; + TAILQ_INIT(&t->stids); + t->nstids_free_head = t->nstids; + + return (0); +} + +void +free_stid_tab(struct adapter *sc) +{ + struct tid_info *t = &sc->tids; + + KASSERT(t->stids_in_use == 0, + ("%s: %d tids still in use.", __func__, t->stids_in_use)); + + if (mtx_initialized(&t->stid_lock)) + mtx_destroy(&t->stid_lock); + free(t->stid_tab, M_CXGBE); + t->stid_tab = NULL; +} + +void +stop_stid_tab(struct adapter *sc) +{ + struct tid_info *t = &sc->tids; + struct tom_data *td = sc->tom_softc; + struct listen_ctx *lctx; + struct synq_entry *synqe; + int i, ntids; + + mtx_lock(&t->stid_lock); + t->stid_tab_stopped = true; + mtx_unlock(&t->stid_lock); + + mtx_lock(&td->lctx_hash_lock); + for (i = 0; i <= td->listen_mask; i++) { + LIST_FOREACH(lctx, &td->listen_hash[i], link) + lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW); + } + mtx_unlock(&td->lctx_hash_lock); + + mtx_lock(&td->toep_list_lock); + TAILQ_FOREACH(synqe, &td->synqe_list, link) { + MPASS(sc->incarnation == synqe->incarnation); + MPASS(synqe->tid >= 0); + MPASS(synqe == lookup_tid(sc, synqe->tid)); + /* Remove tid from the lookup table immediately. */ + CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", + __func__, synqe->tid, synqe->incarnation); + ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1; + remove_tid(sc, synqe->tid, ntids); +#if 0 + /* synqe->tid is stale now but left alone for debug. */ + synqe->tid = -1; +#endif + } + MPASS(TAILQ_EMPTY(&td->stranded_synqe)); + TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link); + MPASS(TAILQ_EMPTY(&td->synqe_list)); + mtx_unlock(&td->toep_list_lock); +} + +void +restart_stid_tab(struct adapter *sc) +{ + struct tid_info *t = &sc->tids; + struct tom_data *td = sc->tom_softc; + struct listen_ctx *lctx; + int i; + + mtx_lock(&td->lctx_hash_lock); + for (i = 0; i <= td->listen_mask; i++) { + LIST_FOREACH(lctx, &td->listen_hash[i], link) { + MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0); + lctx->flags |= LCTX_RPL_PENDING; + if (lctx->inp->inp_vflag & INP_IPV6) + create_server6(sc, lctx); + else + create_server(sc, lctx); + } + } + mtx_unlock(&td->lctx_hash_lock); + + mtx_lock(&t->stid_lock); + t->stid_tab_stopped = false; + mtx_unlock(&t->stid_lock); + +} + static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) { struct tid_info *t = &sc->tids; u_int stid, n, f, mask; struct stid_region *sr = &lctx->stid_region; /* * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in * the TCAM. The start of the stid region is properly aligned (the chip * requires each region to be 128-cell aligned). */ n = isipv6 ? 2 : 1; mask = n - 1; KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, ("%s: stid region (%u, %u) not properly aligned. n = %u", __func__, t->stid_base, t->nstids, n)); mtx_lock(&t->stid_lock); - if (n > t->nstids - t->stids_in_use) { + if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) { mtx_unlock(&t->stid_lock); return (-1); } if (t->nstids_free_head >= n) { /* * This allocation will definitely succeed because the region * starts at a good alignment and we just checked we have enough * stids free. */ f = t->nstids_free_head & mask; t->nstids_free_head -= n + f; stid = t->nstids_free_head; TAILQ_INSERT_HEAD(&t->stids, sr, link); } else { struct stid_region *s; stid = t->nstids_free_head; TAILQ_FOREACH(s, &t->stids, link) { stid += s->used + s->free; f = stid & mask; if (s->free >= n + f) { stid -= n + f; s->free -= n + f; TAILQ_INSERT_AFTER(&t->stids, s, sr, link); goto allocated; } } if (__predict_false(stid != t->nstids)) { panic("%s: stids TAILQ (%p) corrupt." " At %d instead of %d at the end of the queue.", __func__, &t->stids, stid, t->nstids); } mtx_unlock(&t->stid_lock); return (-1); } allocated: sr->used = n; sr->free = f; t->stids_in_use += n; t->stid_tab[stid] = lctx; mtx_unlock(&t->stid_lock); KASSERT(((stid + t->stid_base) & mask) == 0, ("%s: EDOOFUS.", __func__)); return (stid + t->stid_base); } static struct listen_ctx * lookup_stid(struct adapter *sc, int stid) { struct tid_info *t = &sc->tids; return (t->stid_tab[stid - t->stid_base]); } static void free_stid(struct adapter *sc, struct listen_ctx *lctx) { struct tid_info *t = &sc->tids; struct stid_region *sr = &lctx->stid_region; struct stid_region *s; KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); mtx_lock(&t->stid_lock); s = TAILQ_PREV(sr, stid_head, link); if (s != NULL) s->free += sr->used + sr->free; else t->nstids_free_head += sr->used + sr->free; KASSERT(t->stids_in_use >= sr->used, ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, t->stids_in_use, sr->used)); t->stids_in_use -= sr->used; TAILQ_REMOVE(&t->stids, sr, link); mtx_unlock(&t->stid_lock); } static struct listen_ctx * alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) { struct listen_ctx *lctx; INP_WLOCK_ASSERT(inp); lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); if (lctx == NULL) return (NULL); lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); if (lctx->stid < 0) { free(lctx, M_CXGBE); return (NULL); } if (inp->inp_vflag & INP_IPV6 && !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (lctx->ce == NULL) { free(lctx, M_CXGBE); return (NULL); } } lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); lctx->inp = inp; lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); } /* Don't call this directly, use release_lctx instead */ static int free_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; INP_WLOCK_ASSERT(inp); KASSERT(lctx->refcount == 0, ("%s: refcount %d", __func__, lctx->refcount)); KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", __func__, lctx->stid, lctx, lctx->inp); if (lctx->ce) t4_release_clip_entry(sc, lctx->ce); free_stid(sc, lctx); free(lctx, M_CXGBE); return (in_pcbrele_wlocked(inp)); } static void hold_lctx(struct listen_ctx *lctx) { refcount_acquire(&lctx->refcount); } static inline uint32_t listen_hashfn(void *key, u_long mask) { return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); } /* * Add a listen_ctx entry to the listen hash table. */ static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(lctx->inp, td->listen_mask); mtx_lock(&td->lctx_hash_lock); LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); td->lctx_count++; mtx_unlock(&td->lctx_hash_lock); } /* * Look for the listening socket's context entry in the hash and return it. */ static struct listen_ctx * listen_hash_find(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { if (lctx->inp == inp) break; } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Removes the listen_ctx structure for inp from the hash and returns it. */ static struct listen_ctx * listen_hash_del(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx, *l; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { if (lctx->inp == inp) { LIST_REMOVE(lctx, link); td->lctx_count--; break; } } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Releases a hold on the lctx. Must be called with the listening socket's inp * locked. The inp may be freed by this function and it returns NULL to * indicate this. */ static struct inpcb * release_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; int inp_freed = 0; INP_WLOCK_ASSERT(inp); if (refcount_release(&lctx->refcount)) inp_freed = free_lctx(sc, lctx); return (inp_freed ? NULL : inp); } static void send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) { struct mbuf *m = synqe->syn; if_t ifp = m->m_pkthdr.rcvif; struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct wrqe *wr; struct fw_flowc_wr *flowc; struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; const int nparams = 6; const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); const u_int pfvf = sc->pf << S_FW_VIID_PFN; INP_WLOCK_ASSERT(synqe->lctx->inp); MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(synqe->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; flowc->mnemval[4].val = htobe32(512); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[5].val = htobe32(512); synqe->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } static void send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, int rst_status) { struct adapter *sc = tod->tod_softc; struct wrqe *wr; struct cpl_abort_req *req; INP_WLOCK_ASSERT(synqe->lctx->inp); CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", __func__, synqe, synqe->flags, synqe->tid, synqe->flags & TPF_ABORT_SHUTDOWN ? " (abort already in progress)" : ""); if (synqe->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ synqe->flags |= TPF_ABORT_SHUTDOWN; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); wr = alloc_wrqe(sizeof(*req), &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); req->rsvd0 = 0; /* don't have a snd_nxt */ req->rsvd1 = 1; /* no data sent yet */ req->cmd = rst_status; t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); } static int create_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip = inp->inp_laddr.s_addr; req->peer_ip = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int create_server6(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req6 *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; req->peer_ip_hi = 0; req->peer_ip_lo = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int destroy_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_close_listsvr_req *req; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, lctx->stid)); req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); req->rsvd = htobe16(0); t4_wrq_tx(sc, wr); return (0); } /* * Start a listening server by sending a passive open request to HW. * * Can't take adapter lock here and access to sc->flags, * sc->offload_map, if_capenable are all race prone. */ int t4_listen_start(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct vi_info *vi; struct port_info *pi; struct inpcb *inp = tptoinpcb(tp); struct listen_ctx *lctx; int i, rc, v; struct offload_settings settings; INP_WLOCK_ASSERT(inp); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) return (0); /* Don't start a hardware listener for any loopback address. */ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); if (!(inp->inp_vflag & INP_IPV6) && IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) return (0); if (sc->flags & KERN_TLS_ON) return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { log(LOG_ERR, "%s: listen request ignored, %s is busy", __func__, device_get_nameunit(sc->dev)); goto done; } KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM not initialized", __func__)); #endif /* * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first * such VI's queues to send the passive open and receive the reply to * it. * * XXX: need a way to mark a port in use by offload. if_cxgbe should * then reject any attempt to bring down such a port (and maybe reject * attempts to disable IFCAP_TOE on that port too?). */ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE && if_getcapenable(vi->ifp) & IFCAP_TOE) goto found; } } goto done; /* no port that's UP with IFCAP_TOE enabled */ found: if (listen_hash_find(sc, inp) != NULL) goto done; /* already setup */ lctx = alloc_lctx(sc, inp, vi); if (lctx == NULL) { log(LOG_ERR, "%s: listen request ignored, %s couldn't allocate lctx\n", __func__, device_get_nameunit(sc->dev)); goto done; } listen_hash_add(sc, lctx); CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, inp->inp_vflag); if (inp->inp_vflag & INP_IPV6) rc = create_server6(sc, lctx); else rc = create_server(sc, lctx); if (rc != 0) { log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", __func__, device_get_nameunit(sc->dev), rc); (void) listen_hash_del(sc, inp); inp = release_lctx(sc, lctx); /* can't be freed, host stack has a reference */ KASSERT(inp != NULL, ("%s: inp freed", __func__)); goto done; } lctx->flags |= LCTX_RPL_PENDING; done: #if 0 ADAPTER_UNLOCK(sc); #endif return (0); } int t4_listen_stop(struct toedev *tod, struct tcpcb *tp) { struct listen_ctx *lctx; struct adapter *sc = tod->tod_softc; struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); lctx = listen_hash_del(sc, inp); if (lctx == NULL) return (ENOENT); /* no hardware listener for this inp */ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, lctx, lctx->flags); /* * If the reply to the PASS_OPEN is still pending we'll wait for it to * arrive and clean up when it does. */ if (lctx->flags & LCTX_RPL_PENDING) { return (EINPROGRESS); } - destroy_server(sc, lctx); + if (lctx->flags & LCTX_SETUP_IN_HW) + destroy_server(sc, lctx); return (0); } static inline struct synq_entry * -alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) +alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags) { struct synq_entry *synqe; INP_RLOCK_ASSERT(lctx->inp); MPASS(flags == M_WAITOK || flags == M_NOWAIT); synqe = malloc(sizeof(*synqe), M_CXGBE, flags); if (__predict_true(synqe != NULL)) { synqe->flags = TPF_SYNQE; + synqe->incarnation = sc->incarnation; refcount_init(&synqe->refcnt, 1); synqe->lctx = lctx; hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ synqe->syn = NULL; } return (synqe); } static inline void hold_synqe(struct synq_entry *synqe) { refcount_acquire(&synqe->refcnt); } static inline struct inpcb * release_synqe(struct adapter *sc, struct synq_entry *synqe) { struct inpcb *inp; MPASS(synqe->flags & TPF_SYNQE); MPASS(synqe->lctx != NULL); inp = synqe->lctx->inp; MPASS(inp != NULL); INP_WLOCK_ASSERT(inp); if (refcount_release(&synqe->refcnt)) { inp = release_lctx(sc, synqe->lctx); m_freem(synqe->syn); free(synqe, M_CXGBE); } return (inp); } void t4_syncache_added(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; hold_synqe(synqe); } void t4_syncache_removed(struct toedev *tod, void *arg) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = synqe->lctx->inp; /* * XXX: this is a LOR but harmless when running from the softclock. */ INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); } int t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) { struct synq_entry *synqe = arg; if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { struct tcpopt to; struct ip *ip = mtod(m, struct ip *); struct tcphdr *th; if (ip->ip_v == IPVERSION) th = (void *)(ip + 1); else th = (void *)((struct ip6_hdr *)ip + 1); bzero(&to, sizeof(to)); tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), TO_SYN); /* save these for later */ synqe->iss = be32toh(th->th_seq); synqe->irs = be32toh(th->th_ack) - 1; synqe->ts = to.to_tsval; } m_freem(m); /* don't need this any more */ return (0); } static int do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_OPEN_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); INP_WLOCK(inp); CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", __func__, stid, status, lctx->flags); lctx->flags &= ~LCTX_RPL_PENDING; - - if (status != CPL_ERR_NONE) + if (status == CPL_ERR_NONE) + lctx->flags |= LCTX_SETUP_IN_HW; + else log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); #ifdef INVARIANTS /* * If the inp has been dropped (listening socket closed) then * listen_stop must have run and taken the inp out of the hash. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(listen_hash_del(sc, inp) == NULL, ("%s: inp %p still in listen hash", __func__, inp)); } #endif if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* * Listening socket stopped listening earlier and now the chip tells us * it has started the hardware listener. Stop it; the lctx will be * released in do_close_server_rpl. */ if (inp->inp_flags & INP_DROPPED) { destroy_server(sc, lctx); INP_WUNLOCK(inp); return (status); } /* * Failed to start hardware listener. Take inp out of the hash and * release our reference on it. An error message has been logged * already. */ if (status != CPL_ERR_NONE) { listen_hash_del(sc, inp); if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* hardware listener open for business */ INP_WUNLOCK(inp); return (status); } static int do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); if (status != CPL_ERR_NONE) { log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", __func__, status, stid); return (status); } INP_WLOCK(inp); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); return (status); } static void done_with_synqe(struct adapter *sc, struct synq_entry *synqe) { struct tom_data *td = sc->tom_softc; struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; int ntids; INP_WLOCK_ASSERT(inp); - ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; - remove_tid(sc, synqe->tid, ntids); - mtx_lock(&td->toep_list_lock); - TAILQ_REMOVE(&td->synqe_list, synqe, link); - mtx_unlock(&td->toep_list_lock); - release_tid(sc, synqe->tid, lctx->ctrlq); + if (synqe->tid != -1) { + ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; + remove_tid(sc, synqe->tid, ntids); + mtx_lock(&td->toep_list_lock); + TAILQ_REMOVE(&td->synqe_list, synqe, link); + mtx_unlock(&td->toep_list_lock); + release_tid(sc, synqe->tid, lctx->ctrlq); + } t4_l2t_release(e); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } void -synack_failure_cleanup(struct adapter *sc, int tid) +synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe) { - struct synq_entry *synqe = lookup_tid(sc, tid); - INP_WLOCK(synqe->lctx->inp); done_with_synqe(sc, synqe); } int do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct sge_ofld_txq *ofld_txq; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); if (negative_advice(cpl->status)) return (0); /* Ignore negative advice */ INP_WLOCK(inp); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. */ if (synqe->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ done: send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } int do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); INP_WLOCK(inp); KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply for synqe %p (0x%x)", __func__, synqe, synqe->flags)); done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ return (0); } void t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) { struct adapter *sc = tod->tod_softc; struct tom_data *td = sc->tom_softc; struct synq_entry *synqe = arg; struct inpcb *inp = sotoinpcb(so); struct toepcb *toep = synqe->toep; NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); MPASS(toep->tid == synqe->tid); offload_socket(so, toep); make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); toep->flags |= TPF_CPL_PENDING; update_tid(sc, synqe->tid, toep); synqe->flags |= TPF_SYNQE_EXPANDED; mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->synqe_list, synqe, link); mtx_unlock(&td->toep_list_lock); inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; inp->inp_flowid = synqe->rss_hash; } static void t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) { bzero(to, sizeof(*to)); if (t4opt->mss) { to->to_flags |= TOF_MSS; to->to_mss = be16toh(t4opt->mss); } if (t4opt->wsf > 0 && t4opt->wsf < 15) { to->to_flags |= TOF_SCALE; to->to_wscale = t4opt->wsf; } if (t4opt->tstamp) to->to_flags |= TOF_TS; if (t4opt->sack) to->to_flags |= TOF_SACKPERM; } static bool encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) { u_int hlen = be32toh(cpl->hdr_len); if (chip_id(sc) >= CHELSIO_T6) return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); else return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); } static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; unsigned int hlen = be32toh(cpl->hdr_len); uintptr_t l3hdr; const struct tcphdr *tcp; eh = (const void *)(cpl + 1); if (chip_id(sc) >= CHELSIO_T6) { l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); } else { l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } /* extract TOS (DiffServ + ECN) byte for AccECN */ if (iptos) { if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; *iptos = ip->ip_tos; } #ifdef INET6 else if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { const struct ip6_hdr *ip6 = (const void *)l3hdr; *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif /* INET */ } if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; inc->inc_lport = tcp->th_dport; if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; inc->inc_faddr = ip->ip_src; inc->inc_laddr = ip->ip_dst; } else { const struct ip6_hdr *ip6 = (const void *)l3hdr; inc->inc_flags |= INC_ISIPV6; inc->inc6_faddr = ip6->ip6_src; inc->inc6_laddr = ip6->ip6_dst; } } if (th) { bcopy(tcp, th, sizeof(*th)); tcp_fields_to_host(th); /* just like tcp_input */ } } static struct l2t_entry * get_l2te_for_nexthop(struct port_info *pi, if_t ifp, struct in_conninfo *inc) { struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; struct nhop_object *nh; if (inc->inc_flags & INC_ISIPV6) { bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; else ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; } else { dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) if (nh->gw_sa.sa_family == AF_INET) ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; else *((struct sockaddr_in6 *)dst) = nh->gw6_sa; else ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; } e = t4_l2t_get(pi, ifp, dst); return (e); } static int send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, uint32_t opt2, int tid) { struct wrqe *wr; struct cpl_pass_accept_rpl *rpl; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); if (wr == NULL) return (ENOMEM); rpl = wrtod(wr); if (is_t4(sc)) INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); else { struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); rpl5->iss = htobe32(synqe->iss); } rpl->opt0 = opt0; rpl->opt2 = opt2; return (t4_l2t_send(sc, wr, e)); } #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ if (!tunnel) { \ m_freem(m); \ m = NULL; \ } \ reject_reason = __LINE__; \ goto reject; \ } while (0) /* * The context associated with a tid entry via insert_tid could be a synq_entry * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. */ CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); /* * Incoming SYN on a listening socket. * * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, * etc. */ static int do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct tom_data *td = sc->tom_softc; struct toedev *tod; const struct cpl_pass_accept_req *cpl = mtod(m, const void *); unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); unsigned int tid = GET_TID(cpl); struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp; struct socket *so; struct in_conninfo inc; struct tcphdr th; struct tcpopt to; struct port_info *pi; struct vi_info *vi; if_t hw_ifp, ifp; struct l2t_entry *e = NULL; struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid, l2info; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif struct offload_settings settings; uint8_t iptos; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, lctx); /* * Figure out the port the SYN arrived on. We'll look for an exact VI * match in a bit but in case we don't find any we'll use the main VI as * the incoming ifnet. */ l2info = be16toh(cpl->l2info); pi = sc->port[G_SYN_INTF(l2info)]; hw_ifp = pi->vi[0].ifp; m->m_pkthdr.rcvif = hw_ifp; CURVNET_SET(lctx->vnet); /* before any potential REJECT */ /* * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will * also hit the listener. We don't want to offload those. */ if (encapsulated_syn(sc, cpl)) { REJECT_PASS_ACCEPT_REQ(true); } /* * Use the MAC index to lookup the associated VI. If this SYN didn't * match a perfect MAC filter, punt. */ if (!(l2info & F_SYN_XACT_MATCH)) { REJECT_PASS_ACCEPT_REQ(true); } for_each_vi(pi, v, vi) { if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) goto found; } REJECT_PASS_ACCEPT_REQ(true); found: hw_ifp = vi->ifp; /* the cxgbe ifnet */ m->m_pkthdr.rcvif = hw_ifp; tod = TOEDEV(hw_ifp); /* * Don't offload if the peer requested a TCP option that's not known to * the silicon. Send the SYN to the kernel instead. */ if (__predict_false(cpl->tcpopt.unknown)) REJECT_PASS_ACCEPT_REQ(true); /* * Figure out if there is a pseudo interface (vlan, lagg, etc.) * involved. Don't offload if the SYN had a VLAN tag and the vid * doesn't match anything on this interface. * * XXX: lagg support, lagg + vlan support. */ vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); if (vid != 0xfff && vid != 0) { ifp = VLAN_DEVAT(hw_ifp, vid); if (ifp == NULL) REJECT_PASS_ACCEPT_REQ(true); } else ifp = hw_ifp; /* * Don't offload if the ifnet that the SYN came in on is not in the same * vnet as the listening socket. */ if (lctx->vnet != if_getvnet(ifp)) REJECT_PASS_ACCEPT_REQ(true); pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP6 address on this ifnet. This * is more restrictive than in6_localip. */ NET_EPOCH_ENTER(et); if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 2; } else { /* Don't offload if the ifcap isn't enabled */ if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP address on this ifnet. This * is more restrictive than in_localip. */ NET_EPOCH_ENTER(et); if (!in_ifhasaddr(ifp, inc.inc_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 1; } e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } inp = lctx->inp; /* listening socket, not owned by TOE */ INP_RLOCK(inp); /* Don't offload if the listening socket has closed */ if (__predict_false(inp->inp_flags & INP_DROPPED)) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } so = inp->inp_socket; rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ } synqe = alloc_synqe(sc, lctx, M_NOWAIT); if (synqe == NULL) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } MPASS(rss->hash_type == RSS_HASH_TCP); synqe->rss_hash = be32toh(rss->hash_val); atomic_store_int(&synqe->ok_to_respond, 0); init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, &synqe->params); /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. */ t4opt_to_tcpopt(&cpl->tcpopt, &to); toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); if (atomic_load_int(&synqe->ok_to_respond) > 0) { uint64_t opt0; uint32_t opt2; opt0 = calc_options0(vi, &synqe->params); opt2 = calc_options2(vi, &synqe->params); insert_tid(sc, tid, synqe, ntids); synqe->tid = tid; synqe->syn = m; m = NULL; if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { remove_tid(sc, tid, ntids); m = synqe->syn; synqe->syn = NULL; NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } mtx_lock(&td->toep_list_lock); TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link); mtx_unlock(&td->toep_list_lock); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); } else { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); reject: CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); if (e) t4_l2t_release(e); release_tid(sc, tid, lctx->ctrlq); if (synqe) { inp = synqe->lctx->inp; INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } if (m) { /* * The connection request hit a TOE listener but is being passed * on to the kernel sw stack instead of getting offloaded. */ m_adj(m, sizeof(*cpl)); m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; if_input(hw_ifp, m); } return (reject_reason); } static void synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, const struct cpl_pass_establish *cpl, struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); uint8_t iptos; /* start off with the original SYN */ pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = TH_ACK; th->th_ack = synqe->iss + 1; th->th_seq = be32toh(cpl->rcv_isn); bzero(to, sizeof(*to)); if (G_TCPOPT_TSTAMP(tcp_opt)) { to->to_flags |= TOF_TS; to->to_tsecr = synqe->ts; } } static int do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct vi_info *vi; if_t ifp; const struct cpl_pass_establish *cpl = (const void *)(rss + 1); #if defined(KTR) || defined(INVARIANTS) unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); #endif unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; struct in_conninfo inc; struct toepcb *toep; struct epoch_tracker et; int rstreason; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_ESTABLISH, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); NET_EPOCH_ENTER(et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); ifp = synqe->syn->m_pkthdr.rcvif; vi = if_getsoftc(ifp); KASSERT(vi->adapter == sc, ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); if (__predict_false(inp->inp_flags & INP_DROPPED)) { reset: send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], ("%s: CPL arrived on unexpected rxq. %d %d", __func__, synqe->params.rxq_idx, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) goto reset; toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; toep->vnet = lctx->vnet; bcopy(&synqe->params, &toep->params, sizeof(toep->params)); init_toepcb(vi, toep); MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); synqe->tcp_opt = cpl->tcp_opt; synqe->toep = toep; /* Come up with something that syncache_expand should be ok with. */ synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); if (inc.inc_flags & INC_ISIPV6) { if (lctx->ce == NULL) { toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); if (toep->ce == NULL) { free_toepcb(toep); goto reset; /* RST without a CLIP entry? */ } } else { t4_hold_clip_entry(sc, lctx->ce); toep->ce = lctx->ce; } } so = inp->inp_socket; KASSERT(so != NULL, ("%s: socket is NULL", __func__)); rstreason = toe_syncache_expand(&inc, &to, &th, &so); if (rstreason < 0) { free_toepcb(toep); send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } else if (rstreason == 0 || so == NULL) { free_toepcb(toep); goto reset; } /* New connection inpcb is already locked by syncache_expand(). */ new_inp = sotoinpcb(so); INP_WLOCK_ASSERT(new_inp); MPASS(so->so_vnet == lctx->vnet); /* * This is for expansion from syncookies. * * XXX: we've held the tcbinfo lock throughout so there's no risk of * anyone accept'ing a connection before we've installed our hooks, but * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); } INP_WUNLOCK(new_inp); /* Done with the synqe */ inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void t4_init_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); } void t4_uninit_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index fb92c88aa358..58a77ff93c86 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1,2088 +1,2302 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw toe_protosw; static struct protosw toe6_protosw; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); +static int t4_tom_stop(struct adapter *); +static int t4_tom_restart(struct adapter *); static struct uld_info tom_uld_info = { .uld_activate = t4_tom_activate, .uld_deactivate = t4_tom_deactivate, + .uld_stop = t4_tom_stop, + .uld_restart = t4_tom_restart, }; static void release_offload_resources(struct toepcb *); static void done_with_toepcb(struct toepcb *); -static int alloc_tid_tabs(struct tid_info *); -static void free_tid_tabs(struct tid_info *); +static int alloc_tid_tabs(struct adapter *); +static void free_tid_tabs(struct adapter *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); +static void cleanup_stranded_tids(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; + toep->incarnation = sc->incarnation; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. */ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->state != CS_HW_CONFIGURED) { CH_ERR(vi, "tid %d cannot be bound to traffic class %d " "because it is not configured (its state is %d)\n", toep->tid, cp->tc_idx, tc->state); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; tls_init_toep(toep); MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); + toep->flags |= TPF_IN_TOEP_LIST; mtx_unlock(&td->toep_list_lock); } void restore_so_proto(struct socket *so, bool v6) { if (v6) so->so_proto = &tcp6_protosw; else so->so_proto = &tcp_protosw; } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; restore_so_proto(so, inp->inp_vflag & INP_IPV6); SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); + toep->flags &= ~TPF_IN_TOEP_LIST; TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); if (toep->l2te) { t4_l2t_release(toep->l2te); toep->l2te = NULL; } if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); toep->tid = -1; } if (toep->ce) { t4_release_clip_entry(sc, toep->ce); toep->ce = NULL; } if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); } /* * Both the driver and kernel are done with the toepcb. */ static void done_with_toepcb(struct toepcb *toep) { struct tom_data *td = toep->td; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR(KTR_CXGBE, "%s: toep %p (0x%x)", __func__, toep, toep->flags); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_empty(&toep->ulp_pduq)); MPASS(mbufq_empty(&toep->ulp_pdu_reclaimq)); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); MPASS(toep->tid == -1); MPASS(toep->l2te == NULL); MPASS(toep->ce == NULL); mtx_lock(&td->toep_list_lock); - TAILQ_REMOVE(&td->toep_list, toep, link); + if (toep->flags & TPF_IN_TOEP_LIST) { + toep->flags &= ~TPF_IN_TOEP_LIST; + TAILQ_REMOVE(&td->toep_list, toep, link); + } mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) done_with_toepcb(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0); break; default: break; } } static inline uint64_t get_tcb_tflags(const uint64_t *tcb) { return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } static inline uint32_t get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { #define LAST_WORD ((TCB_SIZE / 4) - 1) uint64_t t1, t2; int flit_idx; MPASS(mask != 0); MPASS(word <= LAST_WORD); MPASS(shift < 32); flit_idx = (LAST_WORD - word) / 2; if (word & 0x1) shift += 32; t1 = be64toh(tcb[flit_idx]) >> shift; t2 = 0; if (fls(mask) > 64 - shift) { /* * Will spill over into the next logical flit, which is the flit * before this one. The flit_idx before this one must be valid. */ MPASS(flit_idx > 0); t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); } return ((t2 | t1) & mask); #undef LAST_WORD } #define GET_TCB_FIELD(tcb, F) \ get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) /* * Issues a CPL_GET_TCB to read the entire TCB for the tid. */ static int send_get_tcb(struct adapter *sc, u_int tid) { struct cpl_get_tcb *cpl; struct wrq_cookie cookie; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), &cookie); if (__predict_false(cpl == NULL)) return (ENOMEM); bzero(cpl, sizeof(*cpl)); INIT_TP_WR(cpl, tid); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); cpl->cookie = 0xff; commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); return (0); } static struct tcb_histent * alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) { struct tcb_histent *te; MPASS(flags == M_NOWAIT || flags == M_WAITOK); te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); if (te == NULL) return (NULL); mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); callout_init_mtx(&te->te_callout, &te->te_lock, 0); te->te_adapter = sc; te->te_tid = tid; return (te); } static void free_tcb_histent(struct tcb_histent *te) { mtx_destroy(&te->te_lock); free(te, M_CXGBE); } /* * Start tracking the tid in the TCB history. */ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. */ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. */ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); ti->tcpi_rcv_adv = GET_TCB_FIELD(tcb, RCV_ADV); ti->tcpi_dupacks = GET_TCB_FIELD(tcb, T_DUPACKS); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); ti->tcpi_snd_una = v - GET_TCB_FIELD(tcb, SND_UNA_RAW); ti->tcpi_snd_max = v - GET_TCB_FIELD(tcb, SND_MAX_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. */ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. */ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. */ static void t4_tcp_info(struct toedev *tod, const struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_LOCK_ASSERT(tptoinpcb(tp)); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tptoinpcb(tp)); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif static void send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep) { struct wrq_cookie cookie; struct fw_flowc_wr *flowc; struct ofld_tx_sdesc *txsd; const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval); const int flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) { CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__, toep->tid, toep->tx_credits, toep->txsd_avail); return; } flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie); if (__predict_false(flowc == NULL)) { CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid); return; } flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(1)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[0].val = htobe32(toep->params.emss); txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie); } static void t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu) { struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int idx, len; struct wrq_cookie cookie; struct inpcb *inp = tptoinpcb(tp); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); unsigned short *mtus = &sc->params.mtus[0]; INP_WLOCK_ASSERT(inp); MPASS(mtu > 0); /* kernel is supposed to provide something usable. */ /* tp->snd_una and snd_max are in host byte order too. */ seq = be32toh(seq); CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)", __func__, toep->tid, seq, mtu, toep->params.mtu_idx, mtus[toep->params.mtu_idx]); if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */ (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) { CTR5(KTR_CXGBE, "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).", __func__, toep->tid, seq, tp->snd_una, tp->snd_max); return; } /* Find the best mtu_idx for the suggested MTU. */ for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++) continue; if (idx >= toep->params.mtu_idx) return; /* Never increase the PMTU (just like the kernel). */ /* * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first * one updates the mtu_idx and the second one triggers a retransmit. */ len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie); if (wrh == NULL) { CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n", toep->tid, toep->params.mtu_idx, idx); return; } INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_MAXSEG, V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx)); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0); commit_wrq_wr(toep->ctrlq, wrh, &cookie); /* Update the software toepcb and tcpcb. */ toep->params.mtu_idx = idx; tp->t_maxseg = mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (tp->t_flags & TF_RCVD_TSTMP) toep->params.emss -= TCPOLEN_TSTAMP_APPA; /* Update the firmware flowc. */ send_mss_flowc_wr(sc, toep); /* Update the MTU in the kernel's hostcache. */ if (sc->tt.update_hc_on_pmtu_change != 0) { struct in_conninfo inc = {0}; inc.inc_fibnum = inp->inp_inc.inc_fibnum; if (inp->inp_inc.inc_flags & INC_ISIPV6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = inp->inp_inc.inc6_faddr; } else { inc.inc_faddr = inp->inp_inc.inc_faddr; } tcp_hc_updatemtu(&inc, mtu); } CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", __func__, toep->tid, toep->params.mtu_idx, mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss); } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; bool need_wakeup; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0; toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL); mbufq_drain(&toep->ulp_pduq); mbufq_drain(&toep->ulp_pdu_reclaimq); release_offload_resources(toep); if (!(toep->flags & TPF_ATTACHED)) done_with_toepcb(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); if (need_wakeup) { struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); mtx_lock(lock); wakeup(toep); mtx_unlock(lock); } } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); opt2 |= V_TX_QUEUE(TX_MODQ(pi->tx_chan)); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); if (chip_id(sc) <= CHELSIO_T6) { MPASS(pi->rx_chan == 0 || pi->rx_chan == 1); opt2 |= V_RX_CHANNEL(pi->rx_chan); } MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); MPASS(cp->ulp_mode != ULP_MODE_TCPDDP); return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } /* * Initialize various connection parameters. */ void init_conn_params(struct vi_info *vi , struct offload_settings *s, struct in_conninfo *inc, struct socket *so, const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tom_tunables *tt = &sc->tt; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); u_long wnd; u_int q_idx; MPASS(s->offload != 0); /* Congestion control algorithm */ if (s->cong_algo >= 0) cp->cong_algo = s->cong_algo & M_CONG_CNTRL; else if (sc->tt.cong_algorithm >= 0) cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL; else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) cp->cong_algo = CONG_ALG_RENO; else if (strcasecmp(cc->name, "tahoe") == 0) cp->cong_algo = CONG_ALG_TAHOE; if (strcasecmp(cc->name, "newreno") == 0) cp->cong_algo = CONG_ALG_NEWRENO; if (strcasecmp(cc->name, "highspeed") == 0) cp->cong_algo = CONG_ALG_HIGHSPEED; else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ cp->cong_algo = CONG_ALG_NEWRENO; } } /* Tx traffic scheduling class. */ if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls) cp->tc_idx = s->sched_class; else cp->tc_idx = -1; /* Nagle's algorithm. */ if (s->nagle >= 0) cp->nagle = s->nagle > 0 ? 1 : 0; else cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1; /* TCP Keepalive. */ if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE) cp->keepalive = 1; else cp->keepalive = 0; /* Optimization that's specific to T5 @ 40G. */ if (tt->tx_align >= 0) cp->tx_align = tt->tx_align > 0 ? 1 : 0; else if (chip_id(sc) == CHELSIO_T5 && (port_top_speed(pi) > 10 || sc->params.nports > 2)) cp->tx_align = 1; else cp->tx_align = 0; /* ULP mode. */ cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. */ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Tx queue for this connection. */ if (s->txq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->txq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->txq_rr, 1); else q_idx = s->txq; cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq; /* Rx queue for this connection. */ if (s->rxq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->rxq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1); else q_idx = s->rxq; cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. */ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. */ cp->emss = 0; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int -alloc_tid_tab(struct tid_info *t, int flags) +alloc_tid_tab(struct adapter *sc) { + struct tid_info *t = &sc->tids; MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, - M_ZERO | flags); + M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void -free_tid_tab(struct tid_info *t) +free_tid_tab(struct adapter *sc) { + struct tid_info *t = &sc->tids; KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } -static int -alloc_stid_tab(struct tid_info *t, int flags) -{ - - MPASS(t->nstids > 0); - MPASS(t->stid_tab == NULL); - - t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, - M_ZERO | flags); - if (t->stid_tab == NULL) - return (ENOMEM); - mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); - t->stids_in_use = 0; - TAILQ_INIT(&t->stids); - t->nstids_free_head = t->nstids; - - return (0); -} - static void -free_stid_tab(struct tid_info *t) +free_tid_tabs(struct adapter *sc) { - - KASSERT(t->stids_in_use == 0, - ("%s: %d tids still in use.", __func__, t->stids_in_use)); - - if (mtx_initialized(&t->stid_lock)) - mtx_destroy(&t->stid_lock); - free(t->stid_tab, M_CXGBE); - t->stid_tab = NULL; -} - -static void -free_tid_tabs(struct tid_info *t) -{ - - free_tid_tab(t); - free_stid_tab(t); + free_tid_tab(sc); + free_stid_tab(sc); } static int -alloc_tid_tabs(struct tid_info *t) +alloc_tid_tabs(struct adapter *sc) { int rc; - rc = alloc_tid_tab(t, M_NOWAIT); + rc = alloc_tid_tab(sc); if (rc != 0) goto failed; - rc = alloc_stid_tab(t, M_NOWAIT); + rc = alloc_stid_tab(sc); if (rc != 0) goto failed; return (0); failed: - free_tid_tabs(t); + free_tid_tabs(sc); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct tom_data *td) { #ifdef INVARIANTS int i; if (td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tcb_history(sc, td); - free_tid_tabs(&sc->tids); + free_tid_tabs(sc); free(td, M_CXGBE); } static char * prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, int *buflen) { char *pkt; struct tcphdr *th; int ipv6, len; const int maxlen = max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + sizeof(struct tcphdr); MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); if (pkt == NULL) return (NULL); ipv6 = inp->inp_vflag & INP_IPV6; len = 0; if (EVL_VLANOFTAG(vtag) == 0xfff) { struct ether_header *eh = (void *)pkt; if (ipv6) eh->ether_type = htons(ETHERTYPE_IPV6); else eh->ether_type = htons(ETHERTYPE_IP); len += sizeof(*eh); } else { struct ether_vlan_header *evh = (void *)pkt; evh->evl_encap_proto = htons(ETHERTYPE_VLAN); evh->evl_tag = htons(vtag); if (ipv6) evh->evl_proto = htons(ETHERTYPE_IPV6); else evh->evl_proto = htons(ETHERTYPE_IP); len += sizeof(*evh); } if (ipv6) { struct ip6_hdr *ip6 = (void *)&pkt[len]; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = ip6->ip6_src; } len += sizeof(*ip6); } else { struct ip *ip = (void *)&pkt[len]; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); ip->ip_ttl = inp->inp_ip_ttl; ip->ip_p = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip->ip_src = inp->inp_laddr; ip->ip_dst = ip->ip_src; } len += sizeof(*ip); } th = (void *)&pkt[len]; if (open_type == OPEN_TYPE_ACTIVE) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = inp->inp_fport; /* ditto */ } else if (open_type == OPEN_TYPE_LISTEN) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = th->th_sport; } len += sizeof(th); *pktlen = *buflen = len; return (pkt); } const struct offload_settings * lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, uint16_t vtag, struct inpcb *inp) { const struct t4_offload_policy *op; char *pkt; struct offload_rule *r; int i, matched, pktlen, buflen; static const struct offload_settings allow_offloading_settings = { .offload = 1, .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1, .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1, .ddp = -1, .tls = -1, .txq = QUEUE_RANDOM, .rxq = QUEUE_RANDOM, .mss = -1, }; static const struct offload_settings disallow_offloading_settings = { .offload = 0, /* rest is irrelevant when offload is off. */ }; rw_assert(&sc->policy_lock, RA_LOCKED); /* * If there's no Connection Offloading Policy attached to the device * then we need to return a default static policy. If * "cop_managed_offloading" is true, then we need to disallow * offloading until a COP is attached to the device. Otherwise we * allow offloading ... */ op = sc->policy; if (op == NULL) { if (sc->tt.cop_managed_offloading) return (&disallow_offloading_settings); else return (&allow_offloading_settings); } switch (open_type) { case OPEN_TYPE_ACTIVE: case OPEN_TYPE_LISTEN: pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen); break; case OPEN_TYPE_PASSIVE: MPASS(m != NULL); pkt = mtod(m, char *); MPASS(*pkt == CPL_PASS_ACCEPT_REQ); pkt += sizeof(struct cpl_pass_accept_req); pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); buflen = m->m_len - sizeof(struct cpl_pass_accept_req); break; default: MPASS(0); return (&disallow_offloading_settings); } if (pkt == NULL || pktlen == 0 || buflen == 0) return (&disallow_offloading_settings); matched = 0; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { if (r->open_type != open_type && r->open_type != OPEN_TYPE_DONTCARE) { continue; } matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); if (matched) break; } if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) free(pkt, M_CXGBE); return (matched ? &r->settings : &disallow_offloading_settings); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid, tid; struct wrqe *wr; struct adapter *sc = td_adapter(td); mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); - act_open_failure_cleanup(sc, atid, EHOSTUNREACH); + act_open_failure_cleanup(sc, lookup_atid(sc, atid), + EHOSTUNREACH); free(wr, M_CXGBE); break; case CPL_PASS_ACCEPT_RPL: tid = GET_TID(cpl); CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid); - synack_failure_cleanup(sc, tid); + synack_failure_cleanup(sc, lookup_tid(sc, tid)); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } +/* + * Based on do_abort_req. We treat an abrupt hardware stop as a connection + * abort from the hardware. + */ +static void +live_tid_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status) +{ + struct inpcb *inp; + struct tcpcb *tp; + struct epoch_tracker et; + + MPASS(!(toep->flags & TPF_SYNQE)); + + inp = toep->inp; + CURVNET_SET(toep->vnet); + NET_EPOCH_ENTER(et); /* for tcp_close */ + INP_WLOCK(inp); + tp = intotcpcb(inp); + toep->flags |= TPF_ABORT_SHUTDOWN; + if ((inp->inp_flags & INP_DROPPED) == 0) { + struct socket *so = inp->inp_socket; + + if (so != NULL) + so_error_set(so, status); + tp = tcp_close(tp); + if (tp == NULL) + INP_WLOCK(inp); /* re-acquire */ + } + final_cpl_received(toep); + NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); +} + +static void +cleanup_stranded_tids(void *arg, int count) +{ + TAILQ_HEAD(, toepcb) tlist = TAILQ_HEAD_INITIALIZER(tlist); + TAILQ_HEAD(, synq_entry) slist = TAILQ_HEAD_INITIALIZER(slist); + struct tom_data *td = arg; + struct adapter *sc = td_adapter(td); + struct toepcb *toep; + struct synq_entry *synqe; + + /* Clean up synq entries. */ + mtx_lock(&td->toep_list_lock); + TAILQ_SWAP(&td->stranded_synqe, &slist, synq_entry, link); + mtx_unlock(&td->toep_list_lock); + while ((synqe = TAILQ_FIRST(&slist)) != NULL) { + TAILQ_REMOVE(&slist, synqe, link); + MPASS(synqe->tid >= 0); /* stale, was kept around for debug */ + synqe->tid = -1; + synack_failure_cleanup(sc, synqe); + } + + /* Clean up in-flight active opens. */ + mtx_lock(&td->toep_list_lock); + TAILQ_SWAP(&td->stranded_atids, &tlist, toepcb, link); + mtx_unlock(&td->toep_list_lock); + while ((toep = TAILQ_FIRST(&tlist)) != NULL) { + TAILQ_REMOVE(&tlist, toep, link); + MPASS(toep->tid >= 0); /* stale, was kept around for debug */ + toep->tid = -1; + act_open_failure_cleanup(sc, toep, EHOSTUNREACH); + } + + /* Clean up live connections. */ + mtx_lock(&td->toep_list_lock); + TAILQ_SWAP(&td->stranded_tids, &tlist, toepcb, link); + mtx_unlock(&td->toep_list_lock); + while ((toep = TAILQ_FIRST(&tlist)) != NULL) { + TAILQ_REMOVE(&tlist, toep, link); + MPASS(toep->tid >= 0); /* stale, was kept around for debug */ + toep->tid = -1; + live_tid_failure_cleanup(sc, toep, ECONNABORTED); + } +} + /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; int i, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); TAILQ_INIT(&td->synqe_list); + TAILQ_INIT(&td->stranded_atids); + TAILQ_INIT(&td->stranded_tids); + TASK_INIT(&td->cleanup_stranded_tids, 0, cleanup_stranded_tids, td); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ - rc = alloc_tid_tabs(&sc->tids); + rc = alloc_tid_tabs(sc); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); alloc_tcb_history(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; tod->tod_tcp_info = t4_tcp_info; #ifdef KERN_TLS tod->tod_alloc_tls_session = t4_alloc_tls_session; #endif tod->tod_pmtu_update = t4_pmtu_update; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { SETTOEDEV(vi->ifp, &td->tod); } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; MPASS(TAILQ_EMPTY(&td->synqe_list)); + MPASS(TAILQ_EMPTY(&td->stranded_tids)); + mtx_unlock(&td->toep_list_lock); mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); + taskqueue_drain(taskqueue_thread, &td->cleanup_stranded_tids); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } +static void +stop_atids(struct adapter *sc) +{ + struct tom_data *td = sc->tom_softc; + struct tid_info *t = &sc->tids; + struct toepcb *toep; + int atid; + + /* + * Hashfilters and T6-KTLS are the only other users of atids but they're + * both mutually exclusive with TOE. That means t4_tom owns all the + * atids in the table. + */ + MPASS(!is_hashfilter(sc)); + if (is_t6(sc)) + MPASS(!(sc->flags & KERN_TLS_ON)); + + /* New atids are not being allocated. */ +#ifdef INVARIANTS + mtx_lock(&t->atid_lock); + MPASS(t->atid_alloc_stopped == true); + mtx_unlock(&t->atid_lock); +#endif + + /* + * In-use atids fall in one of these two categories: + * a) Those waiting for L2 resolution before being submitted to + * hardware. + * b) Those that have been submitted to hardware and are awaiting + * replies that will never arrive because the LLD is stopped. + */ + for (atid = 0; atid < t->natids; atid++) { + toep = lookup_atid(sc, atid); + if ((uintptr_t)toep >= (uintptr_t)&t->atid_tab[0] && + (uintptr_t)toep < (uintptr_t)&t->atid_tab[t->natids]) + continue; + MPASS(toep->tid == atid); + MPASS(toep->incarnation == sc->incarnation); + /* + * Take the atid out of the lookup table. toep->tid is stale + * after this but useful for debug. + */ + CTR(KTR_CXGBE, "%s: atid %d@%d STRANDED, removed from table", + __func__, atid, toep->incarnation); + free_atid(sc, toep->tid); +#if 0 + toep->tid = -1; +#endif + mtx_lock(&td->toep_list_lock); + TAILQ_INSERT_TAIL(&td->stranded_atids, toep, link); + mtx_unlock(&td->toep_list_lock); + } + MPASS(atomic_load_int(&t->atids_in_use) == 0); +} + +static void +stop_tids(struct adapter *sc) +{ + struct tom_data *td = sc->tom_softc; + struct toepcb *toep; +#ifdef INVARIANTS + struct tid_info *t = &sc->tids; +#endif + + /* + * The LLD's offload queues are stopped so do_act_establish and + * do_pass_accept_req cannot run and insert tids in parallel with this + * thread. stop_stid_tab has also run and removed the synq entries' + * tids from the table. The only tids in the table are for connections + * at or beyond ESTABLISHED that are still waiting for the final CPL. + */ + mtx_lock(&td->toep_list_lock); + TAILQ_FOREACH(toep, &td->toep_list, link) { + MPASS(sc->incarnation == toep->incarnation); + MPASS(toep->tid >= 0); + MPASS(toep == lookup_tid(sc, toep->tid)); + /* Remove tid from the lookup table immediately. */ + CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", + __func__, toep->tid, toep->incarnation); + remove_tid(sc, toep->tid, toep->ce ? 2 : 1); +#if 0 + /* toep->tid is stale now but left alone for debug. */ + toep->tid = -1; +#endif + /* All toep in this list will get bulk moved to stranded_tids */ + toep->flags &= ~TPF_IN_TOEP_LIST; + } + MPASS(TAILQ_EMPTY(&td->stranded_tids)); + TAILQ_CONCAT(&td->stranded_tids, &td->toep_list, link); + MPASS(TAILQ_EMPTY(&td->toep_list)); + mtx_unlock(&td->toep_list_lock); + + MPASS(atomic_load_int(&t->tids_in_use) == 0); +} + +/* + * L2T is stable because + * 1. stop_lld stopped all new allocations. + * 2. stop_lld also stopped the tx wrq so nothing is enqueueing new WRs to the + * queue or to l2t_entry->wr_list. + * 3. t4_l2t_update is ignoring all L2 updates. + */ +static void +stop_tom_l2t(struct adapter *sc) +{ + struct l2t_data *d = sc->l2t; + struct l2t_entry *e; + int i; + + for (i = 0; i < d->l2t_size; i++) { + e = &d->l2tab[i]; + mtx_lock(&e->lock); + if (e->state == L2T_STATE_VALID) + e->state = L2T_STATE_RESOLVING; + if (!STAILQ_EMPTY(&e->wr_list)) + CXGBE_UNIMPLEMENTED("l2t e->wr_list"); + mtx_unlock(&e->lock); + } +} + +static int +t4_tom_stop(struct adapter *sc) +{ + struct tid_info *t = &sc->tids; + struct tom_data *td = sc->tom_softc; + + ASSERT_SYNCHRONIZED_OP(sc); + + stop_tom_l2t(sc); + if (atomic_load_int(&t->atids_in_use) > 0) + stop_atids(sc); + if (atomic_load_int(&t->stids_in_use) > 0) + stop_stid_tab(sc); + if (atomic_load_int(&t->tids_in_use) > 0) + stop_tids(sc); + taskqueue_enqueue(taskqueue_thread, &td->cleanup_stranded_tids); + + return (0); +} + +static int +t4_tom_restart(struct adapter *sc) +{ + ASSERT_SYNCHRONIZED_OP(sc); + + restart_stid_tab(sc); + + return (0); +} + static int t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; int error, optval; if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) { if (sopt->sopt_dir != SOPT_SET) return (EOPNOTSUPP); if (sopt->sopt_td != NULL) { /* Only settable by the kernel. */ return (EPERM); } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) return (error); if (optval != 0) return (t4_enable_ddp_rcv(so, toep)); else return (EOPNOTSUPP); } return (tcp_ctloutput(so, sopt)); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; /* * No lock is needed as TOE sockets never change between * active and passive. */ if (SOLISTENING(so)) return (EINVAL); if (ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_NONE) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_tom_mod_load(void) { /* CPL handlers */ t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); t4_ddp_mod_load(); t4_tls_mod_load(); bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw)); toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_aio_queue = t4_aio_queue_tom; bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_aio_queue = t4_aio_queue_tom; return (t4_register_uld(&tom_uld_info, ULD_TOM)); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info, ULD_TOM) == EBUSY) return (EBUSY); t4_tls_mod_unload(); t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 0bc368fe3d56..8debab4940b1 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -1,565 +1,578 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef __T4_TOM_H__ #define __T4_TOM_H__ #include #include "common/t4_hw.h" #include "common/t4_msg.h" #include "tom/t4_tls.h" #define LISTEN_HASH_SIZE 32 /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. */ #define MAX_RCV_WND ((1U << 27) - 1) #define DDP_RSVD_WIN (16 * 1024U) #define SB_DDP_INDICATE SB_IN_TOE /* soreceive must respond to indicate */ #define USE_DDP_RX_FLOW_CONTROL #define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) #define PPOD_SIZE (PPOD_SZ(1)) /* TOE PCB flags */ enum { TPF_ATTACHED = (1 << 0), /* a tcpcb refers to this toepcb */ TPF_FLOWC_WR_SENT = (1 << 1), /* firmware flow context WR sent */ TPF_TX_DATA_SENT = (1 << 2), /* some data sent */ TPF_TX_SUSPENDED = (1 << 3), /* tx suspended for lack of resources */ TPF_SEND_FIN = (1 << 4), /* send FIN after all pending data */ TPF_FIN_SENT = (1 << 5), /* FIN has been sent */ TPF_ABORT_SHUTDOWN = (1 << 6), /* connection abort is in progress */ TPF_CPL_PENDING = (1 << 7), /* haven't received the last CPL */ TPF_SYNQE = (1 << 8), /* synq_entry, not really a toepcb */ TPF_SYNQE_EXPANDED = (1 << 9), /* toepcb ready, tid context updated */ TPF_TLS_STARTING = (1 << 10), /* starting TLS receive */ TPF_KTLS = (1 << 11), /* send TLS records from KTLS */ TPF_INITIALIZED = (1 << 12), /* init_toepcb has been called */ TPF_TLS_RECEIVE = (1 << 13), /* should receive TLS records */ TPF_TLS_RX_QUIESCING = (1 << 14), /* RX quiesced for TLS RX startup */ TPF_TLS_RX_QUIESCED = (1 << 15), /* RX quiesced for TLS RX startup */ TPF_WAITING_FOR_FINAL = (1<< 16), /* waiting for wakeup on final CPL */ + TPF_IN_TOEP_LIST = (1 << 17), /* toep is in the main td->toep_list */ }; enum { DDP_OK = (1 << 0), /* OK to turn on DDP */ DDP_SC_REQ = (1 << 1), /* state change (on/off) requested */ DDP_ON = (1 << 2), /* DDP is turned on */ DDP_BUF0_ACTIVE = (1 << 3), /* buffer 0 in use (not invalidated) */ DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */ DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */ DDP_DEAD = (1 << 6), /* toepcb is shutting down */ DDP_AIO = (1 << 7), /* DDP used for AIO, not so_rcv */ DDP_RCVBUF = (1 << 8), /* DDP used for so_rcv, not AIO */ }; struct bio; struct ctl_sg_entry; struct sockopt; struct offload_settings; /* * Connection parameters for an offloaded connection. These are mostly (but not * all) hardware TOE parameters. */ struct conn_params { int8_t rx_coalesce; int8_t cong_algo; int8_t tc_idx; int8_t tstamp; int8_t sack; int8_t nagle; int8_t keepalive; int8_t wscale; int8_t ecn; int8_t mtu_idx; int8_t ulp_mode; int8_t tx_align; int16_t txq_idx; /* ofld_txq = &sc->sge.ofld_txq[txq_idx] */ int16_t rxq_idx; /* ofld_rxq = &sc->sge.ofld_rxq[rxq_idx] */ int16_t l2t_idx; uint16_t emss; uint16_t opt0_bufsize; u_int sndbuf; /* controls TP tx pages */ }; struct ofld_tx_sdesc { uint32_t plen; /* payload length */ uint8_t tx_credits; /* firmware tx credits (unit is 16B) */ }; struct ppod_region { u_int pr_start; u_int pr_len; u_int pr_page_shift[4]; uint32_t pr_tag_mask; /* hardware tagmask for this region. */ uint32_t pr_invalid_bit; /* OR with this to invalidate tag. */ uint32_t pr_alias_mask; /* AND with tag to get alias bits. */ u_int pr_alias_shift; /* shift this much for first alias bit. */ vmem_t *pr_arena; }; struct ppod_reservation { struct ppod_region *prsv_pr; uint32_t prsv_tag; /* Full tag: pgsz, alias, tag, color */ u_int prsv_nppods; }; struct pageset { TAILQ_ENTRY(pageset) link; vm_page_t *pages; int npages; int flags; int offset; /* offset in first page */ int len; struct ppod_reservation prsv; struct vmspace *vm; vm_offset_t start; u_int vm_timestamp; }; TAILQ_HEAD(pagesetq, pageset); #define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */ struct ddp_rcv_buffer { TAILQ_ENTRY(ddp_rcv_buffer) link; void *buf; struct ppod_reservation prsv; size_t len; u_int refs; }; struct ddp_buffer { union { /* DDP_AIO fields */ struct { struct pageset *ps; struct kaiocb *job; int cancel_pending; }; /* DDP_RCVBUF fields */ struct { struct ddp_rcv_buffer *drb; uint32_t placed; }; }; }; /* * (a) - DDP_AIO only * (r) - DDP_RCVBUF only */ struct ddp_pcb { struct mtx lock; u_int flags; int active_id; /* the currently active DDP buffer */ struct ddp_buffer db[2]; union { TAILQ_HEAD(, pageset) cached_pagesets; /* (a) */ TAILQ_HEAD(, ddp_rcv_buffer) cached_buffers; /* (r) */ }; TAILQ_HEAD(, kaiocb) aiojobq; /* (a) */ u_int waiting_count; /* (a) */ u_int active_count; u_int cached_count; struct task requeue_task; struct kaiocb *queueing; /* (a) */ struct mtx cache_lock; /* (r) */ }; struct toepcb { struct tom_data *td; struct inpcb *inp; /* backpointer to host stack's PCB */ u_int flags; /* miscellaneous flags */ - TAILQ_ENTRY(toepcb) link; /* toep_list */ + TAILQ_ENTRY(toepcb) link; /* toep_list or stranded_toep_list */ int refcount; struct vnet *vnet; struct vi_info *vi; /* virtual interface */ struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ctrlq; struct l2t_entry *l2te; /* L2 table entry used by this connection */ struct clip_entry *ce; /* CLIP table entry used by this tid */ int tid; /* Connection identifier */ + int incarnation; /* sc->incarnation when toepcb was allocated */ /* tx credit handling */ u_int tx_total; /* total tx WR credits (in 16B units) */ u_int tx_credits; /* tx WR credits (in 16B units) available */ u_int tx_nocompl; /* tx WR credits since last compl request */ u_int plen_nocompl; /* payload since last compl request */ struct conn_params params; void *ulpcb; void *ulpcb2; struct mbufq ulp_pduq; /* PDUs waiting to be sent out. */ struct mbufq ulp_pdu_reclaimq; struct ddp_pcb ddp; struct tls_ofld_info tls; TAILQ_HEAD(, kaiocb) aiotx_jobq; struct task aiotx_task; struct socket *aiotx_so; /* Tx software descriptor */ uint8_t txsd_total; uint8_t txsd_pidx; uint8_t txsd_cidx; uint8_t txsd_avail; struct ofld_tx_sdesc txsd[]; }; static inline int ulp_mode(struct toepcb *toep) { return (toep->params.ulp_mode); } #define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock) #define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock) #define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED) #define DDP_CACHE_LOCK(toep) mtx_lock(&(toep)->ddp.cache_lock) #define DDP_CACHE_UNLOCK(toep) mtx_unlock(&(toep)->ddp.cache_lock) /* * Compressed state for embryonic connections for a listener. */ struct synq_entry { struct listen_ctx *lctx; /* backpointer to listen ctx */ struct mbuf *syn; int flags; /* same as toepcb's tp_flags */ TAILQ_ENTRY(synq_entry) link; /* synqe_list */ volatile int ok_to_respond; volatile u_int refcnt; int tid; uint32_t iss; uint32_t irs; uint32_t ts; uint32_t rss_hash; __be16 tcp_opt; /* from cpl_pass_establish */ + int incarnation; struct toepcb *toep; struct conn_params params; }; /* listen_ctx flags */ #define LCTX_RPL_PENDING 1 /* waiting for a CPL_PASS_OPEN_RPL */ +#define LCTX_SETUP_IN_HW 2 /* stid entry is setup in hardware */ struct listen_ctx { LIST_ENTRY(listen_ctx) link; /* listen hash linkage */ volatile int refcount; int stid; struct stid_region stid_region; int flags; struct inpcb *inp; /* listening socket's inp */ struct vnet *vnet; struct sge_wrq *ctrlq; struct sge_ofld_rxq *ofld_rxq; struct clip_entry *ce; }; /* tcb_histent flags */ #define TE_RPL_PENDING 1 #define TE_ACTIVE 2 /* bits in one 8b tcb_histent sample. */ #define TS_RTO (1 << 0) #define TS_DUPACKS (1 << 1) #define TS_FASTREXMT (1 << 2) #define TS_SND_BACKLOGGED (1 << 3) #define TS_CWND_LIMITED (1 << 4) #define TS_ECN_ECE (1 << 5) #define TS_ECN_CWR (1 << 6) #define TS_RESERVED (1 << 7) /* Unused. */ struct tcb_histent { struct mtx te_lock; struct callout te_callout; uint64_t te_tcb[TCB_SIZE / sizeof(uint64_t)]; struct adapter *te_adapter; u_int te_flags; u_int te_tid; uint8_t te_pidx; uint8_t te_sample[100]; }; struct tom_data { struct toedev tod; /* toepcb's associated with this TOE device */ struct mtx toep_list_lock; TAILQ_HEAD(, toepcb) toep_list; TAILQ_HEAD(, synq_entry) synqe_list; + /* List of tids left stranded because hw stopped abruptly. */ + TAILQ_HEAD(, toepcb) stranded_atids; + TAILQ_HEAD(, toepcb) stranded_tids; + TAILQ_HEAD(, synq_entry) stranded_synqe; + struct task cleanup_stranded_tids; struct mtx lctx_hash_lock; LIST_HEAD(, listen_ctx) *listen_hash; u_long listen_mask; int lctx_count; /* # of lctx in the hash table */ struct ppod_region pr; struct rwlock tcb_history_lock __aligned(CACHE_LINE_SIZE); struct tcb_histent **tcb_history; int dupack_threshold; /* WRs that will not be sent to the chip because L2 resolution failed */ struct mtx unsent_wr_lock; STAILQ_HEAD(, wrqe) unsent_wr_list; struct task reclaim_wr_resources; }; static inline struct tom_data * tod_td(struct toedev *tod) { return (__containerof(tod, struct tom_data, tod)); } static inline struct adapter * td_adapter(struct tom_data *td) { return (td->tod.tod_softc); } static inline void set_mbuf_raw_wr(struct mbuf *m, bool raw) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[6] = raw; } static inline bool mbuf_raw_wr(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[6]); } static inline void set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[0] = ulp_submode; } static inline uint8_t mbuf_ulp_submode(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[0]); } static inline void set_mbuf_iscsi_iso(struct mbuf *m, bool iso) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[1] = iso; } static inline bool mbuf_iscsi_iso(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[1]); } /* Flags for iSCSI segmentation offload. */ #define CXGBE_ISO_TYPE(flags) ((flags) & 0x3) #define CXGBE_ISO_F 0x4 static inline void set_mbuf_iscsi_iso_flags(struct mbuf *m, uint8_t flags) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.eight[2] = flags; } static inline uint8_t mbuf_iscsi_iso_flags(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.eight[2]); } static inline void set_mbuf_iscsi_iso_mss(struct mbuf *m, uint16_t mss) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_per.sixteen[2] = mss; } static inline uint16_t mbuf_iscsi_iso_mss(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_per.sixteen[2]); } /* t4_tom.c */ struct toepcb *alloc_toepcb(struct vi_info *, int); int init_toepcb(struct vi_info *, struct toepcb *); struct toepcb *hold_toepcb(struct toepcb *); void free_toepcb(struct toepcb *); void offload_socket(struct socket *, struct toepcb *); void restore_so_proto(struct socket *, bool); void undo_offload_socket(struct socket *); void final_cpl_received(struct toepcb *); void insert_tid(struct adapter *, int, void *, int); void *lookup_tid(struct adapter *, int); void update_tid(struct adapter *, int, void *); void remove_tid(struct adapter *, int, int); u_long select_rcv_wnd(struct socket *); int select_rcv_wscale(void); void init_conn_params(struct vi_info *, struct offload_settings *, struct in_conninfo *, struct socket *, const struct tcp_options *, int16_t, struct conn_params *cp); __be64 calc_options0(struct vi_info *, struct conn_params *); __be32 calc_options2(struct vi_info *, struct conn_params *); uint64_t select_ntuple(struct vi_info *, struct l2t_entry *); int negative_advice(int); int add_tid_to_history(struct adapter *, u_int); void t4_pcb_detach(struct toedev *, struct tcpcb *); /* t4_connect.c */ void t4_init_connect_cpl_handlers(void); void t4_uninit_connect_cpl_handlers(void); int t4_connect(struct toedev *, struct socket *, struct nhop_object *, struct sockaddr *); -void act_open_failure_cleanup(struct adapter *, u_int, u_int); +void act_open_failure_cleanup(struct adapter *, struct toepcb *, u_int); /* t4_listen.c */ void t4_init_listen_cpl_handlers(void); void t4_uninit_listen_cpl_handlers(void); int t4_listen_start(struct toedev *, struct tcpcb *); int t4_listen_stop(struct toedev *, struct tcpcb *); void t4_syncache_added(struct toedev *, void *); void t4_syncache_removed(struct toedev *, void *); int t4_syncache_respond(struct toedev *, void *, struct mbuf *); int do_abort_req_synqe(struct sge_iq *, const struct rss_header *, struct mbuf *); int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *, struct mbuf *); void t4_offload_socket(struct toedev *, void *, struct socket *); -void synack_failure_cleanup(struct adapter *, int); +void synack_failure_cleanup(struct adapter *, struct synq_entry *); +int alloc_stid_tab(struct adapter *); +void free_stid_tab(struct adapter *); +void stop_stid_tab(struct adapter *); +void restart_stid_tab(struct adapter *); /* t4_cpl_io.c */ void aiotx_init_toep(struct toepcb *); int t4_aio_queue_aiotx(struct socket *, struct kaiocb *); void t4_init_cpl_io_handlers(void); void t4_uninit_cpl_io_handlers(void); void send_abort_rpl(struct adapter *, struct sge_ofld_txq *, int , int); void send_flowc_wr(struct toepcb *, struct tcpcb *); void send_reset(struct adapter *, struct toepcb *, uint32_t); int send_rx_credits(struct adapter *, struct toepcb *, int); void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t); int t4_close_conn(struct adapter *, struct toepcb *); void t4_rcvd(struct toedev *, struct tcpcb *); void t4_rcvd_locked(struct toedev *, struct tcpcb *); int t4_tod_output(struct toedev *, struct tcpcb *); int t4_send_fin(struct toedev *, struct tcpcb *); int t4_send_rst(struct toedev *, struct tcpcb *); void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *, uint16_t, uint64_t, uint64_t, int, int); void t4_push_frames(struct adapter *, struct toepcb *, int); void t4_push_pdus(struct adapter *, struct toepcb *, int); /* t4_ddp.c */ int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int, const char *); void t4_free_ppod_region(struct ppod_region *); int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *); int t4_alloc_page_pods_for_bio(struct ppod_region *, struct bio *, struct ppod_reservation *); int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int, struct ppod_reservation *); int t4_alloc_page_pods_for_sgl(struct ppod_region *, struct ctl_sg_entry *, int, struct ppod_reservation *); int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int, struct pageset *); int t4_write_page_pods_for_bio(struct adapter *, struct toepcb *, struct ppod_reservation *, struct bio *, struct mbufq *); int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *, struct ppod_reservation *, vm_offset_t, int, struct mbufq *); int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *, struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *); void t4_free_page_pods(struct ppod_reservation *); int t4_aio_queue_ddp(struct socket *, struct kaiocb *); int t4_enable_ddp_rcv(struct socket *, struct toepcb *); void t4_ddp_mod_load(void); void t4_ddp_mod_unload(void); void ddp_assert_empty(struct toepcb *); void ddp_uninit_toep(struct toepcb *); void ddp_queue_toep(struct toepcb *); void release_ddp_resources(struct toepcb *toep); void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t); void handle_ddp_indicate(struct toepcb *); void insert_ddp_data(struct toepcb *, uint32_t); const struct offload_settings *lookup_offload_policy(struct adapter *, int, struct mbuf *, uint16_t, struct inpcb *); /* t4_tls.c */ bool can_tls_offload(struct adapter *); void do_rx_data_tls(const struct cpl_rx_data *, struct toepcb *, struct mbuf *); void t4_push_ktls(struct adapter *, struct toepcb *, int); void tls_received_starting_data(struct adapter *, struct toepcb *, struct sockbuf *, int); void t4_tls_mod_load(void); void t4_tls_mod_unload(void); void tls_init_toep(struct toepcb *); int tls_tx_key(struct toepcb *); void tls_uninit_toep(struct toepcb *); int tls_alloc_ktls(struct toepcb *, struct ktls_session *, int); #endif