diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index 1c98e70a4df5..99e4c222996d 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -1,413 +1,427 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * Active open succeeded. */ static int do_act_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_establish *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); u_int atid = G_TID_TID(ntohl(cpl->tos_atid)); struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(sc, atid); CURVNET_SET(toep->vnet); INP_WLOCK(inp); toep->tid = tid; insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_flowc_wr(toep, NULL); send_reset(sc, toep, be32toh(cpl->snd_isn)); goto done; } make_established(toep, be32toh(cpl->snd_isn) - 1, be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt); inp->inp_flowtype = M_HASHTYPE_OPAQUE; inp->inp_flowid = tid; done: INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void act_open_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status) { struct inpcb *inp = toep->inp; struct toedev *tod = &toep->td->tod; struct epoch_tracker et; + struct tom_data *td = sc->tom_softc; if (toep->tid >= 0) { free_atid(sc, toep->tid); toep->tid = -1; + mtx_lock(&td->toep_list_lock); + if (toep->flags & TPF_IN_TOEP_LIST) { + toep->flags &= ~TPF_IN_TOEP_LIST; + TAILQ_REMOVE(&td->toep_list, toep, link); + } + mtx_unlock(&td->toep_list_lock); } CURVNET_SET(toep->vnet); if (status != EAGAIN) NET_EPOCH_ENTER(et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Active open failed. */ static int do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct toepcb *toep = lookup_atid(sc, atid); int rc; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); /* Ignore negative advice */ if (negative_advice(status)) return (0); if (status && act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), toep->ctrlq); rc = act_open_rpl_status_to_errno(status); act_open_failure_cleanup(sc, toep, rc); return (0); } void t4_init_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl, CPL_COOKIE_TOM); } void t4_uninit_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM); } #ifdef KTR #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ reason = __LINE__; \ rc = (x); \ goto failed; \ } while (0) #else #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ rc = (x); \ goto failed; \ } while (0) #endif static inline int act_open_cpl_size(struct adapter *sc, int isipv6) { int idx; static const int sz_table[3][2] = { { sizeof (struct cpl_act_open_req), sizeof (struct cpl_act_open_req6) }, { sizeof (struct cpl_t5_act_open_req), sizeof (struct cpl_t5_act_open_req6) }, { sizeof (struct cpl_t6_act_open_req), sizeof (struct cpl_t6_act_open_req6) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 2); return (sz_table[idx][!!isipv6]); } /* * active open (soconnect). * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; + struct tom_data *td; struct toepcb *toep = NULL; struct wrqe *wr = NULL; if_t rt_ifp = nh->nh_ifp; struct vi_info *vi; int qid_atid, rc, isipv6; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); #ifdef KTR int reason; #endif struct offload_settings settings; struct epoch_tracker et; uint16_t vid = 0xfff, pcp = 0; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); if (if_gettype(rt_ifp) == IFT_ETHER) vi = if_getsoftc(rt_ifp); else if (if_gettype(rt_ifp) == IFT_L2VLAN) { if_t ifp = VLAN_TRUNKDEV(rt_ifp); vi = if_getsoftc(ifp); VLAN_TAG(rt_ifp, &vid); VLAN_PCP(rt_ifp, &pcp); } else if (if_gettype(rt_ifp) == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); if (sc->flags & KERN_TLS_ON) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, EVL_MAKETAG(vid, pcp, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) DONT_OFFLOAD_ACTIVE_OPEN(EPERM); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; init_conn_params(vi, &settings, &inp->inp_inc, so, NULL, toep->l2te->idx, &toep->params); init_toepcb(vi, toep); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM); if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; if ((inp->inp_vflag & INP_IPV6) == 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); toep->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } offload_socket(so, toep); + /* Add the TOE PCB to the active list */ + td = toep->td; + mtx_lock(&td->toep_list_lock); + TAILQ_INSERT_TAIL(&td->toep_list, toep, link); + toep->flags |= TPF_IN_TOEP_LIST; + mtx_unlock(&td->toep_list_lock); NET_EPOCH_ENTER(et); rc = t4_l2t_send(sc, wr, toep->l2te); NET_EPOCH_EXIT(et); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); #if defined(KTR) reason = __LINE__; #endif failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) t4_release_clip_entry(sc, toep->ce); free_toepcb(toep); } return (rc); } #endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 897c5bcaab1e..f91d193c0fed 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -1,1722 +1,1725 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* stid services */ static int alloc_stid(struct adapter *, struct listen_ctx *, int); static struct listen_ctx *lookup_stid(struct adapter *, int); static void free_stid(struct adapter *, struct listen_ctx *); /* lctx services */ static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, struct vi_info *); static int free_lctx(struct adapter *, struct listen_ctx *); static void hold_lctx(struct listen_ctx *); static void listen_hash_add(struct adapter *, struct listen_ctx *); static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); static int create_server6(struct adapter *, struct listen_ctx *); static int create_server(struct adapter *, struct listen_ctx *); int alloc_stid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; MPASS(t->nstids > 0); MPASS(t->stid_tab == NULL); t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, M_ZERO | M_NOWAIT); if (t->stid_tab == NULL) return (ENOMEM); mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; return (0); } void free_stid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); free(t->stid_tab, M_CXGBE); t->stid_tab = NULL; } void stop_stid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; struct tom_data *td = sc->tom_softc; struct listen_ctx *lctx; struct synq_entry *synqe; int i, ntids; mtx_lock(&t->stid_lock); t->stid_tab_stopped = true; mtx_unlock(&t->stid_lock); mtx_lock(&td->lctx_hash_lock); for (i = 0; i <= td->listen_mask; i++) { LIST_FOREACH(lctx, &td->listen_hash[i], link) lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW); } mtx_unlock(&td->lctx_hash_lock); mtx_lock(&td->toep_list_lock); TAILQ_FOREACH(synqe, &td->synqe_list, link) { MPASS(sc->incarnation == synqe->incarnation); MPASS(synqe->tid >= 0); MPASS(synqe == lookup_tid(sc, synqe->tid)); /* Remove tid from the lookup table immediately. */ CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", __func__, synqe->tid, synqe->incarnation); ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1; remove_tid(sc, synqe->tid, ntids); #if 0 /* synqe->tid is stale now but left alone for debug. */ synqe->tid = -1; #endif } MPASS(TAILQ_EMPTY(&td->stranded_synqe)); TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link); MPASS(TAILQ_EMPTY(&td->synqe_list)); mtx_unlock(&td->toep_list_lock); } void restart_stid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; struct tom_data *td = sc->tom_softc; struct listen_ctx *lctx; int i; mtx_lock(&td->lctx_hash_lock); for (i = 0; i <= td->listen_mask; i++) { LIST_FOREACH(lctx, &td->listen_hash[i], link) { MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0); lctx->flags |= LCTX_RPL_PENDING; if (lctx->inp->inp_vflag & INP_IPV6) create_server6(sc, lctx); else create_server(sc, lctx); } } mtx_unlock(&td->lctx_hash_lock); mtx_lock(&t->stid_lock); t->stid_tab_stopped = false; mtx_unlock(&t->stid_lock); } static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) { struct tid_info *t = &sc->tids; u_int stid, n, f, mask; struct stid_region *sr = &lctx->stid_region; /* * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in * the TCAM. The start of the stid region is properly aligned (the chip * requires each region to be 128-cell aligned). */ n = isipv6 ? 2 : 1; mask = n - 1; KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, ("%s: stid region (%u, %u) not properly aligned. n = %u", __func__, t->stid_base, t->nstids, n)); mtx_lock(&t->stid_lock); if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) { mtx_unlock(&t->stid_lock); return (-1); } if (t->nstids_free_head >= n) { /* * This allocation will definitely succeed because the region * starts at a good alignment and we just checked we have enough * stids free. */ f = t->nstids_free_head & mask; t->nstids_free_head -= n + f; stid = t->nstids_free_head; TAILQ_INSERT_HEAD(&t->stids, sr, link); } else { struct stid_region *s; stid = t->nstids_free_head; TAILQ_FOREACH(s, &t->stids, link) { stid += s->used + s->free; f = stid & mask; if (s->free >= n + f) { stid -= n + f; s->free -= n + f; TAILQ_INSERT_AFTER(&t->stids, s, sr, link); goto allocated; } } if (__predict_false(stid != t->nstids)) { panic("%s: stids TAILQ (%p) corrupt." " At %d instead of %d at the end of the queue.", __func__, &t->stids, stid, t->nstids); } mtx_unlock(&t->stid_lock); return (-1); } allocated: sr->used = n; sr->free = f; t->stids_in_use += n; t->stid_tab[stid] = lctx; mtx_unlock(&t->stid_lock); KASSERT(((stid + t->stid_base) & mask) == 0, ("%s: EDOOFUS.", __func__)); return (stid + t->stid_base); } static struct listen_ctx * lookup_stid(struct adapter *sc, int stid) { struct tid_info *t = &sc->tids; return (t->stid_tab[stid - t->stid_base]); } static void free_stid(struct adapter *sc, struct listen_ctx *lctx) { struct tid_info *t = &sc->tids; struct stid_region *sr = &lctx->stid_region; struct stid_region *s; KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); mtx_lock(&t->stid_lock); s = TAILQ_PREV(sr, stid_head, link); if (s != NULL) s->free += sr->used + sr->free; else t->nstids_free_head += sr->used + sr->free; KASSERT(t->stids_in_use >= sr->used, ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, t->stids_in_use, sr->used)); t->stids_in_use -= sr->used; TAILQ_REMOVE(&t->stids, sr, link); mtx_unlock(&t->stid_lock); } static struct listen_ctx * alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) { struct listen_ctx *lctx; INP_WLOCK_ASSERT(inp); lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); if (lctx == NULL) return (NULL); lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); if (lctx->stid < 0) { free(lctx, M_CXGBE); return (NULL); } if (inp->inp_vflag & INP_IPV6 && !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (lctx->ce == NULL) { free(lctx, M_CXGBE); return (NULL); } } lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); lctx->inp = inp; lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); } /* Don't call this directly, use release_lctx instead */ static int free_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; INP_WLOCK_ASSERT(inp); KASSERT(lctx->refcount == 0, ("%s: refcount %d", __func__, lctx->refcount)); KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", __func__, lctx->stid, lctx, lctx->inp); if (lctx->ce) t4_release_clip_entry(sc, lctx->ce); free_stid(sc, lctx); free(lctx, M_CXGBE); return (in_pcbrele_wlocked(inp)); } static void hold_lctx(struct listen_ctx *lctx) { refcount_acquire(&lctx->refcount); } static inline uint32_t listen_hashfn(void *key, u_long mask) { return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); } /* * Add a listen_ctx entry to the listen hash table. */ static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(lctx->inp, td->listen_mask); mtx_lock(&td->lctx_hash_lock); LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); td->lctx_count++; mtx_unlock(&td->lctx_hash_lock); } /* * Look for the listening socket's context entry in the hash and return it. */ static struct listen_ctx * listen_hash_find(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { if (lctx->inp == inp) break; } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Removes the listen_ctx structure for inp from the hash and returns it. */ static struct listen_ctx * listen_hash_del(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx, *l; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { if (lctx->inp == inp) { LIST_REMOVE(lctx, link); td->lctx_count--; break; } } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Releases a hold on the lctx. Must be called with the listening socket's inp * locked. The inp may be freed by this function and it returns NULL to * indicate this. */ static struct inpcb * release_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; int inp_freed = 0; INP_WLOCK_ASSERT(inp); if (refcount_release(&lctx->refcount)) inp_freed = free_lctx(sc, lctx); return (inp_freed ? NULL : inp); } static void send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) { struct mbuf *m = synqe->syn; if_t ifp = m->m_pkthdr.rcvif; struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct wrqe *wr; struct fw_flowc_wr *flowc; struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; const int nparams = 6; const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); const u_int pfvf = sc->pf << S_FW_VIID_PFN; INP_WLOCK_ASSERT(synqe->lctx->inp); MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(synqe->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; flowc->mnemval[4].val = htobe32(512); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[5].val = htobe32(512); synqe->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } static void send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, int rst_status) { struct adapter *sc = tod->tod_softc; struct wrqe *wr; struct cpl_abort_req *req; INP_WLOCK_ASSERT(synqe->lctx->inp); CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", __func__, synqe, synqe->flags, synqe->tid, synqe->flags & TPF_ABORT_SHUTDOWN ? " (abort already in progress)" : ""); if (synqe->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ synqe->flags |= TPF_ABORT_SHUTDOWN; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); wr = alloc_wrqe(sizeof(*req), &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); req->rsvd0 = 0; /* don't have a snd_nxt */ req->rsvd1 = 1; /* no data sent yet */ req->cmd = rst_status; t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); } static int create_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip = inp->inp_laddr.s_addr; req->peer_ip = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int create_server6(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req6 *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; req->peer_ip_hi = 0; req->peer_ip_lo = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int destroy_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_close_listsvr_req *req; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, lctx->stid)); req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); req->rsvd = htobe16(0); t4_wrq_tx(sc, wr); return (0); } /* * Start a listening server by sending a passive open request to HW. * * Can't take adapter lock here and access to sc->flags, * sc->offload_map, if_capenable are all race prone. */ int t4_listen_start(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct vi_info *vi; struct port_info *pi; struct inpcb *inp = tptoinpcb(tp); struct listen_ctx *lctx; int i, rc, v; struct offload_settings settings; INP_WLOCK_ASSERT(inp); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) return (0); /* Don't start a hardware listener for any loopback address. */ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); if (!(inp->inp_vflag & INP_IPV6) && IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) return (0); if (sc->flags & KERN_TLS_ON) return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { log(LOG_ERR, "%s: listen request ignored, %s is busy", __func__, device_get_nameunit(sc->dev)); goto done; } KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM not initialized", __func__)); #endif /* * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first * such VI's queues to send the passive open and receive the reply to * it. * * XXX: need a way to mark a port in use by offload. if_cxgbe should * then reject any attempt to bring down such a port (and maybe reject * attempts to disable IFCAP_TOE on that port too?). */ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE && if_getcapenable(vi->ifp) & IFCAP_TOE) goto found; } } goto done; /* no port that's UP with IFCAP_TOE enabled */ found: if (listen_hash_find(sc, inp) != NULL) goto done; /* already setup */ lctx = alloc_lctx(sc, inp, vi); if (lctx == NULL) { log(LOG_ERR, "%s: listen request ignored, %s couldn't allocate lctx\n", __func__, device_get_nameunit(sc->dev)); goto done; } listen_hash_add(sc, lctx); CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, inp->inp_vflag); if (inp->inp_vflag & INP_IPV6) rc = create_server6(sc, lctx); else rc = create_server(sc, lctx); if (rc != 0) { log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", __func__, device_get_nameunit(sc->dev), rc); (void) listen_hash_del(sc, inp); inp = release_lctx(sc, lctx); /* can't be freed, host stack has a reference */ KASSERT(inp != NULL, ("%s: inp freed", __func__)); goto done; } lctx->flags |= LCTX_RPL_PENDING; done: #if 0 ADAPTER_UNLOCK(sc); #endif return (0); } int t4_listen_stop(struct toedev *tod, struct tcpcb *tp) { struct listen_ctx *lctx; struct adapter *sc = tod->tod_softc; struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); lctx = listen_hash_del(sc, inp); if (lctx == NULL) return (ENOENT); /* no hardware listener for this inp */ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, lctx, lctx->flags); /* * If the reply to the PASS_OPEN is still pending we'll wait for it to * arrive and clean up when it does. */ if (lctx->flags & LCTX_RPL_PENDING) { return (EINPROGRESS); } if (lctx->flags & LCTX_SETUP_IN_HW) destroy_server(sc, lctx); return (0); } static inline struct synq_entry * alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags) { struct synq_entry *synqe; INP_RLOCK_ASSERT(lctx->inp); MPASS(flags == M_WAITOK || flags == M_NOWAIT); synqe = malloc(sizeof(*synqe), M_CXGBE, flags); if (__predict_true(synqe != NULL)) { synqe->flags = TPF_SYNQE; synqe->incarnation = sc->incarnation; refcount_init(&synqe->refcnt, 1); synqe->lctx = lctx; hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ synqe->syn = NULL; } return (synqe); } static inline void hold_synqe(struct synq_entry *synqe) { refcount_acquire(&synqe->refcnt); } static inline struct inpcb * release_synqe(struct adapter *sc, struct synq_entry *synqe) { struct inpcb *inp; MPASS(synqe->flags & TPF_SYNQE); MPASS(synqe->lctx != NULL); inp = synqe->lctx->inp; MPASS(inp != NULL); INP_WLOCK_ASSERT(inp); if (refcount_release(&synqe->refcnt)) { inp = release_lctx(sc, synqe->lctx); m_freem(synqe->syn); free(synqe, M_CXGBE); } return (inp); } void t4_syncache_added(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; hold_synqe(synqe); } void t4_syncache_removed(struct toedev *tod, void *arg) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = synqe->lctx->inp; /* * XXX: this is a LOR but harmless when running from the softclock. */ INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); } int t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) { struct synq_entry *synqe = arg; if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { struct tcpopt to; struct ip *ip = mtod(m, struct ip *); struct tcphdr *th; if (ip->ip_v == IPVERSION) th = (void *)(ip + 1); else th = (void *)((struct ip6_hdr *)ip + 1); bzero(&to, sizeof(to)); tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), TO_SYN); /* save these for later */ synqe->iss = be32toh(th->th_seq); synqe->irs = be32toh(th->th_ack) - 1; synqe->ts = to.to_tsval; } m_freem(m); /* don't need this any more */ return (0); } static int do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_OPEN_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); INP_WLOCK(inp); CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", __func__, stid, status, lctx->flags); lctx->flags &= ~LCTX_RPL_PENDING; if (status == CPL_ERR_NONE) lctx->flags |= LCTX_SETUP_IN_HW; else log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); #ifdef INVARIANTS /* * If the inp has been dropped (listening socket closed) then * listen_stop must have run and taken the inp out of the hash. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(listen_hash_del(sc, inp) == NULL, ("%s: inp %p still in listen hash", __func__, inp)); } #endif if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* * Listening socket stopped listening earlier and now the chip tells us * it has started the hardware listener. Stop it; the lctx will be * released in do_close_server_rpl. */ if (inp->inp_flags & INP_DROPPED) { destroy_server(sc, lctx); INP_WUNLOCK(inp); return (status); } /* * Failed to start hardware listener. Take inp out of the hash and * release our reference on it. An error message has been logged * already. */ if (status != CPL_ERR_NONE) { listen_hash_del(sc, inp); if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* hardware listener open for business */ INP_WUNLOCK(inp); return (status); } static int do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); if (status != CPL_ERR_NONE) { log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", __func__, status, stid); return (status); } INP_WLOCK(inp); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); return (status); } static void done_with_synqe(struct adapter *sc, struct synq_entry *synqe) { struct tom_data *td = sc->tom_softc; struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; int ntids; INP_WLOCK_ASSERT(inp); if (synqe->tid != -1) { ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; remove_tid(sc, synqe->tid, ntids); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->synqe_list, synqe, link); mtx_unlock(&td->toep_list_lock); release_tid(sc, synqe->tid, lctx->ctrlq); } t4_l2t_release(e); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } void synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe) { INP_WLOCK(synqe->lctx->inp); done_with_synqe(sc, synqe); } int do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct sge_ofld_txq *ofld_txq; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); if (negative_advice(cpl->status)) return (0); /* Ignore negative advice */ INP_WLOCK(inp); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. */ if (synqe->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ done: send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } int do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); INP_WLOCK(inp); KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply for synqe %p (0x%x)", __func__, synqe, synqe->flags)); done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ return (0); } void t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) { struct adapter *sc = tod->tod_softc; struct tom_data *td = sc->tom_softc; struct synq_entry *synqe = arg; struct inpcb *inp = sotoinpcb(so); struct toepcb *toep = synqe->toep; NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); MPASS(toep->tid == synqe->tid); offload_socket(so, toep); make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); toep->flags |= TPF_CPL_PENDING; update_tid(sc, synqe->tid, toep); synqe->flags |= TPF_SYNQE_EXPANDED; mtx_lock(&td->toep_list_lock); + /* Remove synqe from its list and add the TOE PCB to the active list. */ TAILQ_REMOVE(&td->synqe_list, synqe, link); + TAILQ_INSERT_TAIL(&td->toep_list, toep, link); + toep->flags |= TPF_IN_TOEP_LIST; mtx_unlock(&td->toep_list_lock); inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; inp->inp_flowid = synqe->rss_hash; } static void t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) { bzero(to, sizeof(*to)); if (t4opt->mss) { to->to_flags |= TOF_MSS; to->to_mss = be16toh(t4opt->mss); } if (t4opt->wsf > 0 && t4opt->wsf < 15) { to->to_flags |= TOF_SCALE; to->to_wscale = t4opt->wsf; } if (t4opt->tstamp) to->to_flags |= TOF_TS; if (t4opt->sack) to->to_flags |= TOF_SACKPERM; } static bool encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) { u_int hlen = be32toh(cpl->hdr_len); if (chip_id(sc) >= CHELSIO_T6) return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); else return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); } static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; unsigned int hlen = be32toh(cpl->hdr_len); uintptr_t l3hdr; const struct tcphdr *tcp; eh = (const void *)(cpl + 1); if (chip_id(sc) >= CHELSIO_T6) { l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); } else { l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } /* extract TOS (DiffServ + ECN) byte for AccECN */ if (iptos) { if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; *iptos = ip->ip_tos; } #ifdef INET6 else if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { const struct ip6_hdr *ip6 = (const void *)l3hdr; *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif /* INET */ } if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; inc->inc_lport = tcp->th_dport; if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; inc->inc_faddr = ip->ip_src; inc->inc_laddr = ip->ip_dst; } else { const struct ip6_hdr *ip6 = (const void *)l3hdr; inc->inc_flags |= INC_ISIPV6; inc->inc6_faddr = ip6->ip6_src; inc->inc6_laddr = ip6->ip6_dst; } } if (th) { bcopy(tcp, th, sizeof(*th)); tcp_fields_to_host(th); /* just like tcp_input */ } } static struct l2t_entry * get_l2te_for_nexthop(struct port_info *pi, if_t ifp, struct in_conninfo *inc) { struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; struct nhop_object *nh; if (inc->inc_flags & INC_ISIPV6) { bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; else ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; } else { dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) if (nh->gw_sa.sa_family == AF_INET) ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; else *((struct sockaddr_in6 *)dst) = nh->gw6_sa; else ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; } e = t4_l2t_get(pi, ifp, dst); return (e); } static int send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, uint32_t opt2, int tid) { struct wrqe *wr; struct cpl_pass_accept_rpl *rpl; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); if (wr == NULL) return (ENOMEM); rpl = wrtod(wr); if (is_t4(sc)) INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); else { struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); rpl5->iss = htobe32(synqe->iss); } rpl->opt0 = opt0; rpl->opt2 = opt2; return (t4_l2t_send(sc, wr, e)); } #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ if (!tunnel) { \ m_freem(m); \ m = NULL; \ } \ reject_reason = __LINE__; \ goto reject; \ } while (0) /* * The context associated with a tid entry via insert_tid could be a synq_entry * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. */ CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); /* * Incoming SYN on a listening socket. * * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, * etc. */ static int do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct tom_data *td = sc->tom_softc; struct toedev *tod; const struct cpl_pass_accept_req *cpl = mtod(m, const void *); unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); unsigned int tid = GET_TID(cpl); struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp; struct socket *so; struct in_conninfo inc; struct tcphdr th; struct tcpopt to; struct port_info *pi; struct vi_info *vi; if_t hw_ifp, ifp; struct l2t_entry *e = NULL; struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid, l2info; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif struct offload_settings settings; uint8_t iptos; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, lctx); /* * Figure out the port the SYN arrived on. We'll look for an exact VI * match in a bit but in case we don't find any we'll use the main VI as * the incoming ifnet. */ l2info = be16toh(cpl->l2info); pi = sc->port[G_SYN_INTF(l2info)]; hw_ifp = pi->vi[0].ifp; m->m_pkthdr.rcvif = hw_ifp; CURVNET_SET(lctx->vnet); /* before any potential REJECT */ /* * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will * also hit the listener. We don't want to offload those. */ if (encapsulated_syn(sc, cpl)) { REJECT_PASS_ACCEPT_REQ(true); } /* * Use the MAC index to lookup the associated VI. If this SYN didn't * match a perfect MAC filter, punt. */ if (!(l2info & F_SYN_XACT_MATCH)) { REJECT_PASS_ACCEPT_REQ(true); } for_each_vi(pi, v, vi) { if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) goto found; } REJECT_PASS_ACCEPT_REQ(true); found: hw_ifp = vi->ifp; /* the cxgbe ifnet */ m->m_pkthdr.rcvif = hw_ifp; tod = TOEDEV(hw_ifp); /* * Don't offload if the peer requested a TCP option that's not known to * the silicon. Send the SYN to the kernel instead. */ if (__predict_false(cpl->tcpopt.unknown)) REJECT_PASS_ACCEPT_REQ(true); /* * Figure out if there is a pseudo interface (vlan, lagg, etc.) * involved. Don't offload if the SYN had a VLAN tag and the vid * doesn't match anything on this interface. * * XXX: lagg support, lagg + vlan support. */ vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); if (vid != 0xfff && vid != 0) { ifp = VLAN_DEVAT(hw_ifp, vid); if (ifp == NULL) REJECT_PASS_ACCEPT_REQ(true); } else ifp = hw_ifp; /* * Don't offload if the ifnet that the SYN came in on is not in the same * vnet as the listening socket. */ if (lctx->vnet != if_getvnet(ifp)) REJECT_PASS_ACCEPT_REQ(true); pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP6 address on this ifnet. This * is more restrictive than in6_localip. */ NET_EPOCH_ENTER(et); if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 2; } else { /* Don't offload if the ifcap isn't enabled */ if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP address on this ifnet. This * is more restrictive than in_localip. */ NET_EPOCH_ENTER(et); if (!in_ifhasaddr(ifp, inc.inc_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 1; } e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } inp = lctx->inp; /* listening socket, not owned by TOE */ INP_RLOCK(inp); /* Don't offload if the listening socket has closed */ if (__predict_false(inp->inp_flags & INP_DROPPED)) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } so = inp->inp_socket; rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ } synqe = alloc_synqe(sc, lctx, M_NOWAIT); if (synqe == NULL) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } MPASS(rss->hash_type == RSS_HASH_TCP); synqe->rss_hash = be32toh(rss->hash_val); atomic_store_int(&synqe->ok_to_respond, 0); init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, &synqe->params); /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. */ t4opt_to_tcpopt(&cpl->tcpopt, &to); toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); if (atomic_load_int(&synqe->ok_to_respond) > 0) { uint64_t opt0; uint32_t opt2; opt0 = calc_options0(vi, &synqe->params); opt2 = calc_options2(vi, &synqe->params); insert_tid(sc, tid, synqe, ntids); synqe->tid = tid; synqe->syn = m; m = NULL; if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { remove_tid(sc, tid, ntids); m = synqe->syn; synqe->syn = NULL; NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } mtx_lock(&td->toep_list_lock); TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link); mtx_unlock(&td->toep_list_lock); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); } else { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); reject: CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); if (e) t4_l2t_release(e); release_tid(sc, tid, lctx->ctrlq); if (synqe) { inp = synqe->lctx->inp; INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } if (m) { /* * The connection request hit a TOE listener but is being passed * on to the kernel sw stack instead of getting offloaded. */ m_adj(m, sizeof(*cpl)); m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; if_input(hw_ifp, m); } return (reject_reason); } static void synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, const struct cpl_pass_establish *cpl, struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); uint8_t iptos; /* start off with the original SYN */ pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = TH_ACK; th->th_ack = synqe->iss + 1; th->th_seq = be32toh(cpl->rcv_isn); bzero(to, sizeof(*to)); if (G_TCPOPT_TSTAMP(tcp_opt)) { to->to_flags |= TOF_TS; to->to_tsecr = synqe->ts; } } static int do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct vi_info *vi; if_t ifp; const struct cpl_pass_establish *cpl = (const void *)(rss + 1); #if defined(KTR) || defined(INVARIANTS) unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); #endif unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; struct in_conninfo inc; struct toepcb *toep; struct epoch_tracker et; int rstreason; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_ESTABLISH, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); NET_EPOCH_ENTER(et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); ifp = synqe->syn->m_pkthdr.rcvif; vi = if_getsoftc(ifp); KASSERT(vi->adapter == sc, ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); if (__predict_false(inp->inp_flags & INP_DROPPED)) { reset: send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], ("%s: CPL arrived on unexpected rxq. %d %d", __func__, synqe->params.rxq_idx, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) goto reset; toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; toep->vnet = lctx->vnet; bcopy(&synqe->params, &toep->params, sizeof(toep->params)); init_toepcb(vi, toep); MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); synqe->tcp_opt = cpl->tcp_opt; synqe->toep = toep; /* Come up with something that syncache_expand should be ok with. */ synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); if (inc.inc_flags & INC_ISIPV6) { if (lctx->ce == NULL) { toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); if (toep->ce == NULL) { free_toepcb(toep); goto reset; /* RST without a CLIP entry? */ } } else { t4_hold_clip_entry(sc, lctx->ce); toep->ce = lctx->ce; } } so = inp->inp_socket; KASSERT(so != NULL, ("%s: socket is NULL", __func__)); rstreason = toe_syncache_expand(&inc, &to, &th, &so); if (rstreason < 0) { free_toepcb(toep); send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } else if (rstreason == 0 || so == NULL) { free_toepcb(toep); goto reset; } /* New connection inpcb is already locked by syncache_expand(). */ new_inp = sotoinpcb(so); INP_WLOCK_ASSERT(new_inp); MPASS(so->so_vnet == lctx->vnet); /* * This is for expansion from syncookies. * * XXX: we've held the tcbinfo lock throughout so there's no risk of * anyone accept'ing a connection before we've installed our hooks, but * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); } INP_WUNLOCK(new_inp); /* Done with the synqe */ inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void t4_init_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); } void t4_uninit_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index 58a77ff93c86..9b1dcf304196 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1,2302 +1,2314 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw toe_protosw; static struct protosw toe6_protosw; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static int t4_tom_stop(struct adapter *); static int t4_tom_restart(struct adapter *); static struct uld_info tom_uld_info = { .uld_activate = t4_tom_activate, .uld_deactivate = t4_tom_deactivate, .uld_stop = t4_tom_stop, .uld_restart = t4_tom_restart, }; static void release_offload_resources(struct toepcb *); static void done_with_toepcb(struct toepcb *); static int alloc_tid_tabs(struct adapter *); static void free_tid_tabs(struct adapter *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); static void cleanup_stranded_tids(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->incarnation = sc->incarnation; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. */ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->state != CS_HW_CONFIGURED) { CH_ERR(vi, "tid %d cannot be bound to traffic class %d " "because it is not configured (its state is %d)\n", toep->tid, cp->tc_idx, tc->state); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; tls_init_toep(toep); MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); - - /* Add the TOE PCB to the active list */ - mtx_lock(&td->toep_list_lock); - TAILQ_INSERT_HEAD(&td->toep_list, toep, link); - toep->flags |= TPF_IN_TOEP_LIST; - mtx_unlock(&td->toep_list_lock); } void restore_so_proto(struct socket *so, bool v6) { if (v6) so->so_proto = &tcp6_protosw; else so->so_proto = &tcp_protosw; } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; - struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; restore_so_proto(so, inp->inp_vflag & INP_IPV6); SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); - - mtx_lock(&td->toep_list_lock); - toep->flags &= ~TPF_IN_TOEP_LIST; - TAILQ_REMOVE(&td->toep_list, toep, link); - mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); if (toep->l2te) { t4_l2t_release(toep->l2te); toep->l2te = NULL; } if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); toep->tid = -1; + mtx_lock(&td->toep_list_lock); + if (toep->flags & TPF_IN_TOEP_LIST) { + toep->flags &= ~TPF_IN_TOEP_LIST; + TAILQ_REMOVE(&td->toep_list, toep, link); + } + mtx_unlock(&td->toep_list_lock); } if (toep->ce) { t4_release_clip_entry(sc, toep->ce); toep->ce = NULL; } if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); } /* * Both the driver and kernel are done with the toepcb. */ static void done_with_toepcb(struct toepcb *toep) { - struct tom_data *td = toep->td; - KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR(KTR_CXGBE, "%s: toep %p (0x%x)", __func__, toep, toep->flags); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_empty(&toep->ulp_pduq)); MPASS(mbufq_empty(&toep->ulp_pdu_reclaimq)); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); MPASS(toep->tid == -1); MPASS(toep->l2te == NULL); MPASS(toep->ce == NULL); - - mtx_lock(&td->toep_list_lock); - if (toep->flags & TPF_IN_TOEP_LIST) { - toep->flags &= ~TPF_IN_TOEP_LIST; - TAILQ_REMOVE(&td->toep_list, toep, link); - } - mtx_unlock(&td->toep_list_lock); + MPASS((toep->flags & TPF_IN_TOEP_LIST) == 0); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) done_with_toepcb(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0); break; default: break; } } static inline uint64_t get_tcb_tflags(const uint64_t *tcb) { return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } static inline uint32_t get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { #define LAST_WORD ((TCB_SIZE / 4) - 1) uint64_t t1, t2; int flit_idx; MPASS(mask != 0); MPASS(word <= LAST_WORD); MPASS(shift < 32); flit_idx = (LAST_WORD - word) / 2; if (word & 0x1) shift += 32; t1 = be64toh(tcb[flit_idx]) >> shift; t2 = 0; if (fls(mask) > 64 - shift) { /* * Will spill over into the next logical flit, which is the flit * before this one. The flit_idx before this one must be valid. */ MPASS(flit_idx > 0); t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); } return ((t2 | t1) & mask); #undef LAST_WORD } #define GET_TCB_FIELD(tcb, F) \ get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) /* * Issues a CPL_GET_TCB to read the entire TCB for the tid. */ static int send_get_tcb(struct adapter *sc, u_int tid) { struct cpl_get_tcb *cpl; struct wrq_cookie cookie; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), &cookie); if (__predict_false(cpl == NULL)) return (ENOMEM); bzero(cpl, sizeof(*cpl)); INIT_TP_WR(cpl, tid); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); cpl->cookie = 0xff; commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); return (0); } static struct tcb_histent * alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) { struct tcb_histent *te; MPASS(flags == M_NOWAIT || flags == M_WAITOK); te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); if (te == NULL) return (NULL); mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); callout_init_mtx(&te->te_callout, &te->te_lock, 0); te->te_adapter = sc; te->te_tid = tid; return (te); } static void free_tcb_histent(struct tcb_histent *te) { mtx_destroy(&te->te_lock); free(te, M_CXGBE); } /* * Start tracking the tid in the TCB history. */ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. */ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. */ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); ti->tcpi_rcv_adv = GET_TCB_FIELD(tcb, RCV_ADV); ti->tcpi_dupacks = GET_TCB_FIELD(tcb, T_DUPACKS); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); ti->tcpi_snd_una = v - GET_TCB_FIELD(tcb, SND_UNA_RAW); ti->tcpi_snd_max = v - GET_TCB_FIELD(tcb, SND_MAX_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. */ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. */ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. */ static void t4_tcp_info(struct toedev *tod, const struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_LOCK_ASSERT(tptoinpcb(tp)); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tptoinpcb(tp)); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif static void send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep) { struct wrq_cookie cookie; struct fw_flowc_wr *flowc; struct ofld_tx_sdesc *txsd; const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval); const int flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) { CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__, toep->tid, toep->tx_credits, toep->txsd_avail); return; } flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie); if (__predict_false(flowc == NULL)) { CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid); return; } flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(1)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[0].val = htobe32(toep->params.emss); txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie); } static void t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu) { struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int idx, len; struct wrq_cookie cookie; struct inpcb *inp = tptoinpcb(tp); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); unsigned short *mtus = &sc->params.mtus[0]; INP_WLOCK_ASSERT(inp); MPASS(mtu > 0); /* kernel is supposed to provide something usable. */ /* tp->snd_una and snd_max are in host byte order too. */ seq = be32toh(seq); CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)", __func__, toep->tid, seq, mtu, toep->params.mtu_idx, mtus[toep->params.mtu_idx]); if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */ (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) { CTR5(KTR_CXGBE, "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).", __func__, toep->tid, seq, tp->snd_una, tp->snd_max); return; } /* Find the best mtu_idx for the suggested MTU. */ for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++) continue; if (idx >= toep->params.mtu_idx) return; /* Never increase the PMTU (just like the kernel). */ /* * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first * one updates the mtu_idx and the second one triggers a retransmit. */ len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie); if (wrh == NULL) { CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n", toep->tid, toep->params.mtu_idx, idx); return; } INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_MAXSEG, V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx)); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0); commit_wrq_wr(toep->ctrlq, wrh, &cookie); /* Update the software toepcb and tcpcb. */ toep->params.mtu_idx = idx; tp->t_maxseg = mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (tp->t_flags & TF_RCVD_TSTMP) toep->params.emss -= TCPOLEN_TSTAMP_APPA; /* Update the firmware flowc. */ send_mss_flowc_wr(sc, toep); /* Update the MTU in the kernel's hostcache. */ if (sc->tt.update_hc_on_pmtu_change != 0) { struct in_conninfo inc = {0}; inc.inc_fibnum = inp->inp_inc.inc_fibnum; if (inp->inp_inc.inc_flags & INC_ISIPV6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = inp->inp_inc.inc6_faddr; } else { inc.inc_faddr = inp->inp_inc.inc_faddr; } tcp_hc_updatemtu(&inc, mtu); } CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", __func__, toep->tid, toep->params.mtu_idx, mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss); } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; bool need_wakeup; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0; toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL); mbufq_drain(&toep->ulp_pduq); mbufq_drain(&toep->ulp_pdu_reclaimq); release_offload_resources(toep); if (!(toep->flags & TPF_ATTACHED)) done_with_toepcb(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); if (need_wakeup) { struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); mtx_lock(lock); wakeup(toep); mtx_unlock(lock); } } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); opt2 |= V_TX_QUEUE(TX_MODQ(pi->tx_chan)); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); if (chip_id(sc) <= CHELSIO_T6) { MPASS(pi->rx_chan == 0 || pi->rx_chan == 1); opt2 |= V_RX_CHANNEL(pi->rx_chan); } MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); MPASS(cp->ulp_mode != ULP_MODE_TCPDDP); return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } /* * Initialize various connection parameters. */ void init_conn_params(struct vi_info *vi , struct offload_settings *s, struct in_conninfo *inc, struct socket *so, const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tom_tunables *tt = &sc->tt; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); u_long wnd; u_int q_idx; MPASS(s->offload != 0); /* Congestion control algorithm */ if (s->cong_algo >= 0) cp->cong_algo = s->cong_algo & M_CONG_CNTRL; else if (sc->tt.cong_algorithm >= 0) cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL; else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) cp->cong_algo = CONG_ALG_RENO; else if (strcasecmp(cc->name, "tahoe") == 0) cp->cong_algo = CONG_ALG_TAHOE; if (strcasecmp(cc->name, "newreno") == 0) cp->cong_algo = CONG_ALG_NEWRENO; if (strcasecmp(cc->name, "highspeed") == 0) cp->cong_algo = CONG_ALG_HIGHSPEED; else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ cp->cong_algo = CONG_ALG_NEWRENO; } } /* Tx traffic scheduling class. */ if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls) cp->tc_idx = s->sched_class; else cp->tc_idx = -1; /* Nagle's algorithm. */ if (s->nagle >= 0) cp->nagle = s->nagle > 0 ? 1 : 0; else cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1; /* TCP Keepalive. */ if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE) cp->keepalive = 1; else cp->keepalive = 0; /* Optimization that's specific to T5 @ 40G. */ if (tt->tx_align >= 0) cp->tx_align = tt->tx_align > 0 ? 1 : 0; else if (chip_id(sc) == CHELSIO_T5 && (port_top_speed(pi) > 10 || sc->params.nports > 2)) cp->tx_align = 1; else cp->tx_align = 0; /* ULP mode. */ cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. */ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Tx queue for this connection. */ if (s->txq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->txq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->txq_rr, 1); else q_idx = s->txq; cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq; /* Rx queue for this connection. */ if (s->rxq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->rxq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1); else q_idx = s->rxq; cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. */ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. */ cp->emss = 0; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } static void free_tid_tabs(struct adapter *sc) { free_tid_tab(sc); free_stid_tab(sc); } static int alloc_tid_tabs(struct adapter *sc) { int rc; rc = alloc_tid_tab(sc); if (rc != 0) goto failed; rc = alloc_stid_tab(sc); if (rc != 0) goto failed; return (0); failed: free_tid_tabs(sc); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct tom_data *td) { #ifdef INVARIANTS int i; if (td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tcb_history(sc, td); free_tid_tabs(sc); free(td, M_CXGBE); } static char * prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, int *buflen) { char *pkt; struct tcphdr *th; int ipv6, len; const int maxlen = max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + sizeof(struct tcphdr); MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); if (pkt == NULL) return (NULL); ipv6 = inp->inp_vflag & INP_IPV6; len = 0; if (EVL_VLANOFTAG(vtag) == 0xfff) { struct ether_header *eh = (void *)pkt; if (ipv6) eh->ether_type = htons(ETHERTYPE_IPV6); else eh->ether_type = htons(ETHERTYPE_IP); len += sizeof(*eh); } else { struct ether_vlan_header *evh = (void *)pkt; evh->evl_encap_proto = htons(ETHERTYPE_VLAN); evh->evl_tag = htons(vtag); if (ipv6) evh->evl_proto = htons(ETHERTYPE_IPV6); else evh->evl_proto = htons(ETHERTYPE_IP); len += sizeof(*evh); } if (ipv6) { struct ip6_hdr *ip6 = (void *)&pkt[len]; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = ip6->ip6_src; } len += sizeof(*ip6); } else { struct ip *ip = (void *)&pkt[len]; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); ip->ip_ttl = inp->inp_ip_ttl; ip->ip_p = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip->ip_src = inp->inp_laddr; ip->ip_dst = ip->ip_src; } len += sizeof(*ip); } th = (void *)&pkt[len]; if (open_type == OPEN_TYPE_ACTIVE) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = inp->inp_fport; /* ditto */ } else if (open_type == OPEN_TYPE_LISTEN) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = th->th_sport; } len += sizeof(th); *pktlen = *buflen = len; return (pkt); } const struct offload_settings * lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, uint16_t vtag, struct inpcb *inp) { const struct t4_offload_policy *op; char *pkt; struct offload_rule *r; int i, matched, pktlen, buflen; static const struct offload_settings allow_offloading_settings = { .offload = 1, .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1, .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1, .ddp = -1, .tls = -1, .txq = QUEUE_RANDOM, .rxq = QUEUE_RANDOM, .mss = -1, }; static const struct offload_settings disallow_offloading_settings = { .offload = 0, /* rest is irrelevant when offload is off. */ }; rw_assert(&sc->policy_lock, RA_LOCKED); /* * If there's no Connection Offloading Policy attached to the device * then we need to return a default static policy. If * "cop_managed_offloading" is true, then we need to disallow * offloading until a COP is attached to the device. Otherwise we * allow offloading ... */ op = sc->policy; if (op == NULL) { if (sc->tt.cop_managed_offloading) return (&disallow_offloading_settings); else return (&allow_offloading_settings); } switch (open_type) { case OPEN_TYPE_ACTIVE: case OPEN_TYPE_LISTEN: pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen); break; case OPEN_TYPE_PASSIVE: MPASS(m != NULL); pkt = mtod(m, char *); MPASS(*pkt == CPL_PASS_ACCEPT_REQ); pkt += sizeof(struct cpl_pass_accept_req); pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); buflen = m->m_len - sizeof(struct cpl_pass_accept_req); break; default: MPASS(0); return (&disallow_offloading_settings); } if (pkt == NULL || pktlen == 0 || buflen == 0) return (&disallow_offloading_settings); matched = 0; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { if (r->open_type != open_type && r->open_type != OPEN_TYPE_DONTCARE) { continue; } matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); if (matched) break; } if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) free(pkt, M_CXGBE); return (matched ? &r->settings : &disallow_offloading_settings); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid, tid; struct wrqe *wr; struct adapter *sc = td_adapter(td); mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, lookup_atid(sc, atid), EHOSTUNREACH); free(wr, M_CXGBE); break; case CPL_PASS_ACCEPT_RPL: tid = GET_TID(cpl); CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid); synack_failure_cleanup(sc, lookup_tid(sc, tid)); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } /* * Based on do_abort_req. We treat an abrupt hardware stop as a connection * abort from the hardware. */ static void live_tid_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status) { struct inpcb *inp; struct tcpcb *tp; struct epoch_tracker et; MPASS(!(toep->flags & TPF_SYNQE)); inp = toep->inp; CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); toep->flags |= TPF_ABORT_SHUTDOWN; if ((inp->inp_flags & INP_DROPPED) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) so_error_set(so, status); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ } final_cpl_received(toep); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } static void cleanup_stranded_tids(void *arg, int count) { TAILQ_HEAD(, toepcb) tlist = TAILQ_HEAD_INITIALIZER(tlist); TAILQ_HEAD(, synq_entry) slist = TAILQ_HEAD_INITIALIZER(slist); struct tom_data *td = arg; struct adapter *sc = td_adapter(td); struct toepcb *toep; struct synq_entry *synqe; /* Clean up synq entries. */ mtx_lock(&td->toep_list_lock); TAILQ_SWAP(&td->stranded_synqe, &slist, synq_entry, link); mtx_unlock(&td->toep_list_lock); while ((synqe = TAILQ_FIRST(&slist)) != NULL) { TAILQ_REMOVE(&slist, synqe, link); MPASS(synqe->tid >= 0); /* stale, was kept around for debug */ synqe->tid = -1; synack_failure_cleanup(sc, synqe); } /* Clean up in-flight active opens. */ mtx_lock(&td->toep_list_lock); TAILQ_SWAP(&td->stranded_atids, &tlist, toepcb, link); mtx_unlock(&td->toep_list_lock); while ((toep = TAILQ_FIRST(&tlist)) != NULL) { TAILQ_REMOVE(&tlist, toep, link); MPASS(toep->tid >= 0); /* stale, was kept around for debug */ toep->tid = -1; act_open_failure_cleanup(sc, toep, EHOSTUNREACH); } /* Clean up live connections. */ mtx_lock(&td->toep_list_lock); TAILQ_SWAP(&td->stranded_tids, &tlist, toepcb, link); mtx_unlock(&td->toep_list_lock); while ((toep = TAILQ_FIRST(&tlist)) != NULL) { TAILQ_REMOVE(&tlist, toep, link); MPASS(toep->tid >= 0); /* stale, was kept around for debug */ toep->tid = -1; live_tid_failure_cleanup(sc, toep, ECONNABORTED); } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; int i, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); TAILQ_INIT(&td->synqe_list); TAILQ_INIT(&td->stranded_atids); TAILQ_INIT(&td->stranded_tids); TASK_INIT(&td->cleanup_stranded_tids, 0, cleanup_stranded_tids, td); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(sc); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); alloc_tcb_history(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; tod->tod_tcp_info = t4_tcp_info; #ifdef KERN_TLS tod->tod_alloc_tls_session = t4_alloc_tls_session; #endif tod->tod_pmtu_update = t4_pmtu_update; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { SETTOEDEV(vi->ifp, &td->tod); } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; MPASS(TAILQ_EMPTY(&td->synqe_list)); MPASS(TAILQ_EMPTY(&td->stranded_tids)); mtx_unlock(&td->toep_list_lock); mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); taskqueue_drain(taskqueue_thread, &td->cleanup_stranded_tids); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static void stop_atids(struct adapter *sc) { struct tom_data *td = sc->tom_softc; struct tid_info *t = &sc->tids; struct toepcb *toep; int atid; /* * Hashfilters and T6-KTLS are the only other users of atids but they're * both mutually exclusive with TOE. That means t4_tom owns all the * atids in the table. */ MPASS(!is_hashfilter(sc)); if (is_t6(sc)) MPASS(!(sc->flags & KERN_TLS_ON)); /* New atids are not being allocated. */ #ifdef INVARIANTS mtx_lock(&t->atid_lock); MPASS(t->atid_alloc_stopped == true); mtx_unlock(&t->atid_lock); #endif /* * In-use atids fall in one of these two categories: * a) Those waiting for L2 resolution before being submitted to * hardware. * b) Those that have been submitted to hardware and are awaiting * replies that will never arrive because the LLD is stopped. */ for (atid = 0; atid < t->natids; atid++) { toep = lookup_atid(sc, atid); if ((uintptr_t)toep >= (uintptr_t)&t->atid_tab[0] && (uintptr_t)toep < (uintptr_t)&t->atid_tab[t->natids]) continue; + if (__predict_false(toep == NULL)) + continue; MPASS(toep->tid == atid); MPASS(toep->incarnation == sc->incarnation); /* * Take the atid out of the lookup table. toep->tid is stale * after this but useful for debug. */ CTR(KTR_CXGBE, "%s: atid %d@%d STRANDED, removed from table", __func__, atid, toep->incarnation); free_atid(sc, toep->tid); #if 0 toep->tid = -1; #endif mtx_lock(&td->toep_list_lock); + toep->flags &= ~TPF_IN_TOEP_LIST; + TAILQ_REMOVE(&td->toep_list, toep, link); TAILQ_INSERT_TAIL(&td->stranded_atids, toep, link); mtx_unlock(&td->toep_list_lock); } MPASS(atomic_load_int(&t->atids_in_use) == 0); } static void stop_tids(struct adapter *sc) { struct tom_data *td = sc->tom_softc; struct toepcb *toep; #ifdef INVARIANTS struct tid_info *t = &sc->tids; #endif /* * The LLD's offload queues are stopped so do_act_establish and * do_pass_accept_req cannot run and insert tids in parallel with this * thread. stop_stid_tab has also run and removed the synq entries' * tids from the table. The only tids in the table are for connections * at or beyond ESTABLISHED that are still waiting for the final CPL. */ mtx_lock(&td->toep_list_lock); TAILQ_FOREACH(toep, &td->toep_list, link) { MPASS(sc->incarnation == toep->incarnation); MPASS(toep->tid >= 0); MPASS(toep == lookup_tid(sc, toep->tid)); /* Remove tid from the lookup table immediately. */ CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", __func__, toep->tid, toep->incarnation); remove_tid(sc, toep->tid, toep->ce ? 2 : 1); #if 0 /* toep->tid is stale now but left alone for debug. */ toep->tid = -1; #endif /* All toep in this list will get bulk moved to stranded_tids */ toep->flags &= ~TPF_IN_TOEP_LIST; } MPASS(TAILQ_EMPTY(&td->stranded_tids)); TAILQ_CONCAT(&td->stranded_tids, &td->toep_list, link); MPASS(TAILQ_EMPTY(&td->toep_list)); mtx_unlock(&td->toep_list_lock); MPASS(atomic_load_int(&t->tids_in_use) == 0); } /* * L2T is stable because * 1. stop_lld stopped all new allocations. * 2. stop_lld also stopped the tx wrq so nothing is enqueueing new WRs to the * queue or to l2t_entry->wr_list. * 3. t4_l2t_update is ignoring all L2 updates. */ static void stop_tom_l2t(struct adapter *sc) { struct l2t_data *d = sc->l2t; + struct tom_data *td = sc->tom_softc; struct l2t_entry *e; + struct wrqe *wr; int i; + /* + * This task cannot be enqueued because L2 state changes are not being + * processed. But if it's already scheduled or running then we need to + * wait for it to cleanup the atids in the unsent_wr_list. + */ + taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); + MPASS(STAILQ_EMPTY(&td->unsent_wr_list)); + for (i = 0; i < d->l2t_size; i++) { e = &d->l2tab[i]; mtx_lock(&e->lock); if (e->state == L2T_STATE_VALID) e->state = L2T_STATE_RESOLVING; - if (!STAILQ_EMPTY(&e->wr_list)) - CXGBE_UNIMPLEMENTED("l2t e->wr_list"); + /* + * stop_atids is going to clean up _all_ atids in use, including + * these that were pending L2 resolution. Just discard the WRs. + */ + while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) { + STAILQ_REMOVE_HEAD(&e->wr_list, link); + free(wr, M_CXGBE); + } mtx_unlock(&e->lock); } } static int t4_tom_stop(struct adapter *sc) { struct tid_info *t = &sc->tids; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); stop_tom_l2t(sc); if (atomic_load_int(&t->atids_in_use) > 0) stop_atids(sc); if (atomic_load_int(&t->stids_in_use) > 0) stop_stid_tab(sc); if (atomic_load_int(&t->tids_in_use) > 0) stop_tids(sc); taskqueue_enqueue(taskqueue_thread, &td->cleanup_stranded_tids); + /* + * L2T and atid_tab are restarted before t4_tom_restart so this assert + * is not valid in t4_tom_restart. This is the next best place for it. + */ + MPASS(STAILQ_EMPTY(&td->unsent_wr_list)); + return (0); } static int t4_tom_restart(struct adapter *sc) { ASSERT_SYNCHRONIZED_OP(sc); restart_stid_tab(sc); return (0); } static int t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; int error, optval; if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) { if (sopt->sopt_dir != SOPT_SET) return (EOPNOTSUPP); if (sopt->sopt_td != NULL) { /* Only settable by the kernel. */ return (EPERM); } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) return (error); if (optval != 0) return (t4_enable_ddp_rcv(so, toep)); else return (EOPNOTSUPP); } return (tcp_ctloutput(so, sopt)); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; /* * No lock is needed as TOE sockets never change between * active and passive. */ if (SOLISTENING(so)) return (EINVAL); if (ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_NONE) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_tom_mod_load(void) { /* CPL handlers */ t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); t4_ddp_mod_load(); t4_tls_mod_load(); bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw)); toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_aio_queue = t4_aio_queue_tom; bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_aio_queue = t4_aio_queue_tom; return (t4_register_uld(&tom_uld_info, ULD_TOM)); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info, ULD_TOM) == EBUSY) return (EBUSY); t4_tls_mod_unload(); t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);