diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h
index b17056dc0338..c13901ee1a59 100644
--- a/sys/dev/cxgbe/offload.h
+++ b/sys/dev/cxgbe/offload.h
@@ -1,250 +1,251 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2010 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#ifndef __T4_OFFLOAD_H__
#define __T4_OFFLOAD_H__
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/condvar.h>
#define INIT_ULPTX_WRH(w, wrlen, atomic, tid) do { \
(w)->wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
(w)->wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
V_FW_WR_FLOWID(tid)); \
(w)->wr_lo = cpu_to_be64(0); \
} while (0)
#define INIT_ULPTX_WR(w, wrlen, atomic, tid) \
INIT_ULPTX_WRH(&((w)->wr), wrlen, atomic, tid)
#define INIT_TP_WR(w, tid) do { \
(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | \
V_FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \
(w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(sizeof(*w), 16)) | \
V_FW_WR_FLOWID(tid)); \
(w)->wr.wr_lo = cpu_to_be64(0); \
} while (0)
#define INIT_TP_WR_MIT_CPL(w, cpl, tid) do { \
INIT_TP_WR(w, tid); \
OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \
} while (0)
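/*
 * Illustrative usage sketch (editorial note, not part of the source): a work
 * request that carries an immediate CPL is typically prepared with the macros
 * above.  For example, later in this patch an abort request is built roughly
 * like this:
 *
 *	struct cpl_abort_req *req = wrtod(wr);
 *
 *	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 *
 * INIT_TP_WR fills in wr_hi/wr_mid/wr_lo (opcode, immediate length, LEN16,
 * and flow id) and INIT_TP_WR_MIT_CPL additionally stamps the CPL opcode and
 * TID into the embedded CPL header.
 */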
TAILQ_HEAD(stid_head, stid_region);
struct listen_ctx;
struct stid_region {
TAILQ_ENTRY(stid_region) link;
u_int used; /* # of stids used by this region */
u_int free; /* # of contiguous stids free right after this region */
};
/*
* Max # of ATIDs. The absolute HW max is larger than this but we reserve a few
* of the upper bits for use as a cookie to demux the reply.
*/
#define MAX_ATIDS (M_TID_TID + 1)
union aopen_entry {
void *data;
union aopen_entry *next;
};
/* cxgbe_rate_tag flags */
enum {
EO_FLOWC_PENDING = (1 << 0), /* flowc needs to be sent */
EO_FLOWC_RPL_PENDING = (1 << 1), /* flowc credits due back */
EO_SND_TAG_REF = (1 << 2), /* kernel has a ref on us */
EO_FLUSH_RPL_PENDING = (1 << 3), /* credit flush rpl due back */
};
struct cxgbe_rate_tag {
struct m_snd_tag com;
struct adapter *adapter;
u_int flags;
struct mtx lock;
int port_id;
int etid;
struct mbufq pending_tx, pending_fwack;
int plen;
struct sge_ofld_txq *eo_txq;
uint32_t ctrl0;
uint16_t iqid;
int8_t schedcl;
uint64_t max_rate; /* in bytes/s */
uint8_t tx_total; /* total tx WR credits (in 16B units) */
uint8_t tx_credits; /* tx WR credits (in 16B units) available */
uint8_t tx_nocompl; /* tx WR credits since last compl request */
uint8_t ncompl; /* # of completions outstanding. */
};
static inline struct cxgbe_rate_tag *
mst_to_crt(struct m_snd_tag *t)
{
return (__containerof(t, struct cxgbe_rate_tag, com));
}
union etid_entry {
struct cxgbe_rate_tag *cst;
union etid_entry *next;
};
/*
* Holds the size, base address, start, end, etc. of various types of TIDs. The
* tables themselves are allocated dynamically.
*/
struct tid_info {
u_int nstids;
u_int stid_base;
u_int natids;
u_int nftids;
u_int ftid_base;
u_int ftid_end;
u_int nhpftids;
u_int hpftid_base;
u_int hpftid_end;
u_int ntids;
u_int tid_base;
u_int netids;
u_int etid_base;
u_int etid_end;
struct mtx stid_lock __aligned(CACHE_LINE_SIZE);
struct listen_ctx **stid_tab;
u_int stids_in_use;
u_int nstids_free_head; /* # of available stids at the beginning */
struct stid_head stids;
+ bool stid_tab_stopped;
struct mtx atid_lock __aligned(CACHE_LINE_SIZE);
union aopen_entry *atid_tab;
union aopen_entry *afree;
u_int atids_in_use;
bool atid_alloc_stopped;
/* High priority filters and normal filters share the lock and cv. */
struct mtx ftid_lock __aligned(CACHE_LINE_SIZE);
struct cv ftid_cv;
struct filter_entry *ftid_tab;
struct filter_entry *hpftid_tab;
u_int ftids_in_use;
u_int hpftids_in_use;
/*
* hashfilter and TOE are mutually exclusive and both use ntids and
* tids_in_use. The lock and cv are used only by hashfilter.
*/
struct mtx hftid_lock __aligned(CACHE_LINE_SIZE);
struct cv hftid_cv;
void **tid_tab;
u_int tids_in_use;
void *hftid_hash_4t; /* LIST_HEAD(, filter_entry) *hftid_hash_4t; */
u_long hftid_4t_mask;
void *hftid_hash_tid; /* LIST_HEAD(, filter_entry) *hftid_hash_tid; */
u_long hftid_tid_mask;
struct mtx etid_lock __aligned(CACHE_LINE_SIZE);
union etid_entry *etid_tab;
union etid_entry *efree;
u_int etids_in_use;
};
struct t4_range {
u_int start;
u_int size;
};
struct t4_virt_res { /* virtualized HW resources */
struct t4_range ddp;
struct t4_range iscsi;
struct t4_range stag;
struct t4_range rq;
struct t4_range pbl;
struct t4_range qp;
struct t4_range cq;
struct t4_range srq;
struct t4_range ocq;
struct t4_range l2t;
struct t4_range key;
};
enum {
ULD_TOM = 0,
ULD_IWARP,
ULD_ISCSI,
ULD_MAX = ULD_ISCSI
};
struct adapter;
struct port_info;
struct uld_info {
int (*uld_activate)(struct adapter *);
int (*uld_deactivate)(struct adapter *);
int (*uld_stop)(struct adapter *);
int (*uld_restart)(struct adapter *);
};
struct tom_tunables {
int cong_algorithm;
int sndbuf;
int ddp;
int rx_coalesce;
int tls;
int tx_align;
int tx_zcopy;
int cop_managed_offloading;
int autorcvbuf_inc;
int update_hc_on_pmtu_change;
int iso;
};
/* iWARP driver tunables */
struct iw_tunables {
int wc_en;
};
struct tls_tunables {
int inline_keys;
int combo_wrs;
};
#ifdef TCP_OFFLOAD
int t4_register_uld(struct uld_info *, int);
int t4_unregister_uld(struct uld_info *, int);
int t4_activate_uld(struct adapter *, int);
int t4_deactivate_uld(struct adapter *, int);
int uld_active(struct adapter *, int);
#endif
#endif
diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c
index 4e81f23dc267..1c98e70a4df5 100644
--- a/sys/dev/cxgbe/tom/t4_connect.c
+++ b/sys/dev/cxgbe/tom/t4_connect.c
@@ -1,412 +1,413 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "t4_clip.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"
/*
* Active open succeeded.
*/
static int
do_act_establish(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_act_establish *cpl = (const void *)(rss + 1);
u_int tid = GET_TID(cpl);
u_int atid = G_TID_TID(ntohl(cpl->tos_atid));
struct toepcb *toep = lookup_atid(sc, atid);
struct inpcb *inp = toep->inp;
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid);
free_atid(sc, atid);
CURVNET_SET(toep->vnet);
INP_WLOCK(inp);
toep->tid = tid;
insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1);
if (inp->inp_flags & INP_DROPPED) {
/* socket closed by the kernel before hw told us it connected */
send_flowc_wr(toep, NULL);
send_reset(sc, toep, be32toh(cpl->snd_isn));
goto done;
}
make_established(toep, be32toh(cpl->snd_isn) - 1,
be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt);
inp->inp_flowtype = M_HASHTYPE_OPAQUE;
inp->inp_flowid = tid;
done:
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return (0);
}
void
-act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status)
+act_open_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status)
{
- struct toepcb *toep = lookup_atid(sc, atid);
struct inpcb *inp = toep->inp;
struct toedev *tod = &toep->td->tod;
struct epoch_tracker et;
- free_atid(sc, atid);
- toep->tid = -1;
+ if (toep->tid >= 0) {
+ free_atid(sc, toep->tid);
+ toep->tid = -1;
+ }
CURVNET_SET(toep->vnet);
if (status != EAGAIN)
NET_EPOCH_ENTER(et);
INP_WLOCK(inp);
toe_connect_failed(tod, inp, status);
final_cpl_received(toep); /* unlocks inp */
if (status != EAGAIN)
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
}
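/*
 * Editorial note on the change above: act_open_failure_cleanup() now takes
 * the toepcb itself rather than an atid, and it frees the atid only while
 * toep->tid is still valid (>= 0), setting it to -1 afterwards.  This
 * presumably makes the cleanup safe to call even when the atid has already
 * been released elsewhere.
 */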
/*
* Active open failed.
*/
static int
do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
struct toepcb *toep = lookup_atid(sc, atid);
int rc;
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status);
/* Ignore negative advice */
if (negative_advice(status))
return (0);
if (status && act_open_has_tid(status))
release_tid(sc, GET_TID(cpl), toep->ctrlq);
rc = act_open_rpl_status_to_errno(status);
- act_open_failure_cleanup(sc, atid, rc);
+ act_open_failure_cleanup(sc, toep, rc);
return (0);
}
void
t4_init_connect_cpl_handlers(void)
{
t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl,
CPL_COOKIE_TOM);
}
void
t4_uninit_connect_cpl_handlers(void)
{
t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL);
t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM);
}
#ifdef KTR
#define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \
reason = __LINE__; \
rc = (x); \
goto failed; \
} while (0)
#else
#define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \
rc = (x); \
goto failed; \
} while (0)
#endif
static inline int
act_open_cpl_size(struct adapter *sc, int isipv6)
{
int idx;
static const int sz_table[3][2] = {
{
sizeof (struct cpl_act_open_req),
sizeof (struct cpl_act_open_req6)
},
{
sizeof (struct cpl_t5_act_open_req),
sizeof (struct cpl_t5_act_open_req6)
},
{
sizeof (struct cpl_t6_act_open_req),
sizeof (struct cpl_t6_act_open_req6)
},
};
MPASS(chip_id(sc) >= CHELSIO_T4);
idx = min(chip_id(sc) - CHELSIO_T4, 2);
return (sz_table[idx][!!isipv6]);
}
/*
* active open (soconnect).
*
* State of affairs on entry:
* soisconnecting (so_state |= SS_ISCONNECTING)
* tcbinfo not locked (This has changed - used to be WLOCKed)
* inp WLOCKed
* tp->t_state = TCPS_SYN_SENT
* rtalloc1, RT_UNLOCK on rt.
*/
int
t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh,
struct sockaddr *nam)
{
struct adapter *sc = tod->tod_softc;
struct toepcb *toep = NULL;
struct wrqe *wr = NULL;
if_t rt_ifp = nh->nh_ifp;
struct vi_info *vi;
int qid_atid, rc, isipv6;
struct inpcb *inp = sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
#ifdef KTR
int reason;
#endif
struct offload_settings settings;
struct epoch_tracker et;
uint16_t vid = 0xfff, pcp = 0;
INP_WLOCK_ASSERT(inp);
KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
("%s: dest addr %p has family %u", __func__, nam, nam->sa_family));
if (if_gettype(rt_ifp) == IFT_ETHER)
vi = if_getsoftc(rt_ifp);
else if (if_gettype(rt_ifp) == IFT_L2VLAN) {
if_t ifp = VLAN_TRUNKDEV(rt_ifp);
vi = if_getsoftc(ifp);
VLAN_TAG(rt_ifp, &vid);
VLAN_PCP(rt_ifp, &pcp);
} else if (if_gettype(rt_ifp) == IFT_IEEE8023ADLAG)
DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */
else
DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
if (sc->flags & KERN_TLS_ON)
DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
rw_rlock(&sc->policy_lock);
settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL,
EVL_MAKETAG(vid, pcp, 0), inp);
rw_runlock(&sc->policy_lock);
if (!settings.offload)
DONT_OFFLOAD_ACTIVE_OPEN(EPERM);
toep = alloc_toepcb(vi, M_NOWAIT);
if (toep == NULL)
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
toep->tid = alloc_atid(sc, toep);
if (toep->tid < 0)
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
toep->l2te = t4_l2t_get(vi->pi, rt_ifp,
nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam);
if (toep->l2te == NULL)
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
toep->vnet = so->so_vnet;
init_conn_params(vi, &settings, &inp->inp_inc, so, NULL,
toep->l2te->idx, &toep->params);
init_toepcb(vi, toep);
isipv6 = nam->sa_family == AF_INET6;
wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq);
if (wr == NULL)
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) |
V_TID_COOKIE(CPL_COOKIE_TOM);
if (isipv6) {
struct cpl_act_open_req6 *cpl = wrtod(wr);
struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl;
struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl;
if ((inp->inp_vflag & INP_IPV6) == 0)
DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
toep->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
if (toep->ce == NULL)
DONT_OFFLOAD_ACTIVE_OPEN(ENOENT);
switch (chip_id(sc)) {
case CHELSIO_T4:
INIT_TP_WR(cpl, 0);
cpl->params = select_ntuple(vi, toep->l2te);
break;
case CHELSIO_T5:
INIT_TP_WR(cpl5, 0);
cpl5->iss = htobe32(tp->iss);
cpl5->params = select_ntuple(vi, toep->l2te);
break;
case CHELSIO_T6:
default:
INIT_TP_WR(cpl6, 0);
cpl6->iss = htobe32(tp->iss);
cpl6->params = select_ntuple(vi, toep->l2te);
break;
}
OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
qid_atid));
cpl->local_port = inp->inp_lport;
cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
cpl->peer_port = inp->inp_fport;
cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
cpl->opt0 = calc_options0(vi, &toep->params);
cpl->opt2 = calc_options2(vi, &toep->params);
CTR6(KTR_CXGBE,
"%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x",
__func__, toep->tid, toep, inp, be64toh(cpl->opt0),
be32toh(cpl->opt2));
} else {
struct cpl_act_open_req *cpl = wrtod(wr);
struct cpl_t5_act_open_req *cpl5 = (void *)cpl;
struct cpl_t6_act_open_req *cpl6 = (void *)cpl;
switch (chip_id(sc)) {
case CHELSIO_T4:
INIT_TP_WR(cpl, 0);
cpl->params = select_ntuple(vi, toep->l2te);
break;
case CHELSIO_T5:
INIT_TP_WR(cpl5, 0);
cpl5->iss = htobe32(tp->iss);
cpl5->params = select_ntuple(vi, toep->l2te);
break;
case CHELSIO_T6:
default:
INIT_TP_WR(cpl6, 0);
cpl6->iss = htobe32(tp->iss);
cpl6->params = select_ntuple(vi, toep->l2te);
break;
}
OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
qid_atid));
inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
&cpl->peer_ip, &cpl->peer_port);
cpl->opt0 = calc_options0(vi, &toep->params);
cpl->opt2 = calc_options2(vi, &toep->params);
CTR6(KTR_CXGBE,
"%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x",
__func__, toep->tid, toep, inp, be64toh(cpl->opt0),
be32toh(cpl->opt2));
}
offload_socket(so, toep);
NET_EPOCH_ENTER(et);
rc = t4_l2t_send(sc, wr, toep->l2te);
NET_EPOCH_EXIT(et);
if (rc == 0) {
toep->flags |= TPF_CPL_PENDING;
return (0);
}
undo_offload_socket(so);
#if defined(KTR)
reason = __LINE__;
#endif
failed:
CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc);
if (wr)
free_wrqe(wr);
if (toep) {
if (toep->tid >= 0)
free_atid(sc, toep->tid);
if (toep->l2te)
t4_l2t_release(toep->l2te);
if (toep->ce)
t4_release_clip_entry(sc, toep->ce);
free_toepcb(toep);
}
return (rc);
}
#endif
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c
index 6e8361734db7..897c5bcaab1e 100644
--- a/sys/dev/cxgbe/tom/t4_listen.c
+++ b/sys/dev/cxgbe/tom/t4_listen.c
@@ -1,1614 +1,1722 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/domain.h>
#include <sys/fnv_hash.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
#include <netinet/tcp_timer.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "t4_clip.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"
/* stid services */
static int alloc_stid(struct adapter *, struct listen_ctx *, int);
static struct listen_ctx *lookup_stid(struct adapter *, int);
static void free_stid(struct adapter *, struct listen_ctx *);
/* lctx services */
static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
struct vi_info *);
static int free_lctx(struct adapter *, struct listen_ctx *);
static void hold_lctx(struct listen_ctx *);
static void listen_hash_add(struct adapter *, struct listen_ctx *);
static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);
+static int create_server6(struct adapter *, struct listen_ctx *);
+static int create_server(struct adapter *, struct listen_ctx *);
+
+int
+alloc_stid_tab(struct adapter *sc)
+{
+ struct tid_info *t = &sc->tids;
+
+ MPASS(t->nstids > 0);
+ MPASS(t->stid_tab == NULL);
+
+ t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
+ M_ZERO | M_NOWAIT);
+ if (t->stid_tab == NULL)
+ return (ENOMEM);
+ mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
+ t->stids_in_use = 0;
+ TAILQ_INIT(&t->stids);
+ t->nstids_free_head = t->nstids;
+
+ return (0);
+}
+
+void
+free_stid_tab(struct adapter *sc)
+{
+ struct tid_info *t = &sc->tids;
+
+ KASSERT(t->stids_in_use == 0,
+ ("%s: %d tids still in use.", __func__, t->stids_in_use));
+
+ if (mtx_initialized(&t->stid_lock))
+ mtx_destroy(&t->stid_lock);
+ free(t->stid_tab, M_CXGBE);
+ t->stid_tab = NULL;
+}
+
+void
+stop_stid_tab(struct adapter *sc)
+{
+ struct tid_info *t = &sc->tids;
+ struct tom_data *td = sc->tom_softc;
+ struct listen_ctx *lctx;
+ struct synq_entry *synqe;
+ int i, ntids;
+
+ mtx_lock(&t->stid_lock);
+ t->stid_tab_stopped = true;
+ mtx_unlock(&t->stid_lock);
+
+ mtx_lock(&td->lctx_hash_lock);
+ for (i = 0; i <= td->listen_mask; i++) {
+ LIST_FOREACH(lctx, &td->listen_hash[i], link)
+ lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW);
+ }
+ mtx_unlock(&td->lctx_hash_lock);
+
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_FOREACH(synqe, &td->synqe_list, link) {
+ MPASS(sc->incarnation == synqe->incarnation);
+ MPASS(synqe->tid >= 0);
+ MPASS(synqe == lookup_tid(sc, synqe->tid));
+ /* Remove tid from the lookup table immediately. */
+ CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
+ __func__, synqe->tid, synqe->incarnation);
+ ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1;
+ remove_tid(sc, synqe->tid, ntids);
+#if 0
+ /* synqe->tid is stale now but left alone for debug. */
+ synqe->tid = -1;
+#endif
+ }
+ MPASS(TAILQ_EMPTY(&td->stranded_synqe));
+ TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link);
+ MPASS(TAILQ_EMPTY(&td->synqe_list));
+ mtx_unlock(&td->toep_list_lock);
+}
+
+void
+restart_stid_tab(struct adapter *sc)
+{
+ struct tid_info *t = &sc->tids;
+ struct tom_data *td = sc->tom_softc;
+ struct listen_ctx *lctx;
+ int i;
+
+ mtx_lock(&td->lctx_hash_lock);
+ for (i = 0; i <= td->listen_mask; i++) {
+ LIST_FOREACH(lctx, &td->listen_hash[i], link) {
+ MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0);
+ lctx->flags |= LCTX_RPL_PENDING;
+ if (lctx->inp->inp_vflag & INP_IPV6)
+ create_server6(sc, lctx);
+ else
+ create_server(sc, lctx);
+ }
+ }
+ mtx_unlock(&td->lctx_hash_lock);
+
+ mtx_lock(&t->stid_lock);
+ t->stid_tab_stopped = false;
+ mtx_unlock(&t->stid_lock);
+
+}
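/*
 * Editorial note on the new helpers above: alloc_stid_tab/free_stid_tab
 * manage the stid table's lifetime, while stop_stid_tab/restart_stid_tab
 * implement a suspend/resume cycle.  stop marks the table stopped (so
 * alloc_stid starts failing), clears LCTX_RPL_PENDING and LCTX_SETUP_IN_HW
 * on every listener, and moves embryonic connections from synqe_list to
 * stranded_synqe after removing their tids from the lookup table.  restart
 * re-sends CPL_PASS_OPEN_REQ/REQ6 for every listener and re-enables stid
 * allocation.
 */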
+
static int
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
{
struct tid_info *t = &sc->tids;
u_int stid, n, f, mask;
struct stid_region *sr = &lctx->stid_region;
/*
* An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
* the TCAM. The start of the stid region is properly aligned (the chip
* requires each region to be 128-cell aligned).
*/
n = isipv6 ? 2 : 1;
mask = n - 1;
KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
("%s: stid region (%u, %u) not properly aligned. n = %u",
__func__, t->stid_base, t->nstids, n));
mtx_lock(&t->stid_lock);
- if (n > t->nstids - t->stids_in_use) {
+ if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) {
mtx_unlock(&t->stid_lock);
return (-1);
}
if (t->nstids_free_head >= n) {
/*
* This allocation will definitely succeed because the region
* starts at a good alignment and we just checked we have enough
* stids free.
*/
f = t->nstids_free_head & mask;
t->nstids_free_head -= n + f;
stid = t->nstids_free_head;
TAILQ_INSERT_HEAD(&t->stids, sr, link);
} else {
struct stid_region *s;
stid = t->nstids_free_head;
TAILQ_FOREACH(s, &t->stids, link) {
stid += s->used + s->free;
f = stid & mask;
if (s->free >= n + f) {
stid -= n + f;
s->free -= n + f;
TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
goto allocated;
}
}
if (__predict_false(stid != t->nstids)) {
panic("%s: stids TAILQ (%p) corrupt."
" At %d instead of %d at the end of the queue.",
__func__, &t->stids, stid, t->nstids);
}
mtx_unlock(&t->stid_lock);
return (-1);
}
allocated:
sr->used = n;
sr->free = f;
t->stids_in_use += n;
t->stid_tab[stid] = lctx;
mtx_unlock(&t->stid_lock);
KASSERT(((stid + t->stid_base) & mask) == 0,
("%s: EDOOFUS.", __func__));
return (stid + t->stid_base);
}
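/*
 * Worked example for the allocation above (illustrative values, not from the
 * driver): an IPv6 listener needs n = 2 naturally aligned stids, so mask = 1.
 * With nstids_free_head = 5, f = 5 & 1 = 1, nstids_free_head drops to
 * 5 - (2 + 1) = 2, and the new region occupies stids 2 and 3 with
 * sr->free = 1 spare stid left immediately after it; stids 0 and 1 remain in
 * the free head.
 */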
static struct listen_ctx *
lookup_stid(struct adapter *sc, int stid)
{
struct tid_info *t = &sc->tids;
return (t->stid_tab[stid - t->stid_base]);
}
static void
free_stid(struct adapter *sc, struct listen_ctx *lctx)
{
struct tid_info *t = &sc->tids;
struct stid_region *sr = &lctx->stid_region;
struct stid_region *s;
KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
mtx_lock(&t->stid_lock);
s = TAILQ_PREV(sr, stid_head, link);
if (s != NULL)
s->free += sr->used + sr->free;
else
t->nstids_free_head += sr->used + sr->free;
KASSERT(t->stids_in_use >= sr->used,
("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
t->stids_in_use, sr->used));
t->stids_in_use -= sr->used;
TAILQ_REMOVE(&t->stids, sr, link);
mtx_unlock(&t->stid_lock);
}
static struct listen_ctx *
alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
{
struct listen_ctx *lctx;
INP_WLOCK_ASSERT(inp);
lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
if (lctx == NULL)
return (NULL);
lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
if (lctx->stid < 0) {
free(lctx, M_CXGBE);
return (NULL);
}
if (inp->inp_vflag & INP_IPV6 &&
!IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
if (lctx->ce == NULL) {
free(lctx, M_CXGBE);
return (NULL);
}
}
lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
refcount_init(&lctx->refcount, 1);
lctx->inp = inp;
lctx->vnet = inp->inp_socket->so_vnet;
in_pcbref(inp);
return (lctx);
}
/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
struct inpcb *inp = lctx->inp;
INP_WLOCK_ASSERT(inp);
KASSERT(lctx->refcount == 0,
("%s: refcount %d", __func__, lctx->refcount));
KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
__func__, lctx->stid, lctx, lctx->inp);
if (lctx->ce)
t4_release_clip_entry(sc, lctx->ce);
free_stid(sc, lctx);
free(lctx, M_CXGBE);
return (in_pcbrele_wlocked(inp));
}
static void
hold_lctx(struct listen_ctx *lctx)
{
refcount_acquire(&lctx->refcount);
}
static inline uint32_t
listen_hashfn(void *key, u_long mask)
{
return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}
/*
* Add a listen_ctx entry to the listen hash table.
*/
static void
listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
{
struct tom_data *td = sc->tom_softc;
int bucket = listen_hashfn(lctx->inp, td->listen_mask);
mtx_lock(&td->lctx_hash_lock);
LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
td->lctx_count++;
mtx_unlock(&td->lctx_hash_lock);
}
/*
* Look for the listening socket's context entry in the hash and return it.
*/
static struct listen_ctx *
listen_hash_find(struct adapter *sc, struct inpcb *inp)
{
struct tom_data *td = sc->tom_softc;
int bucket = listen_hashfn(inp, td->listen_mask);
struct listen_ctx *lctx;
mtx_lock(&td->lctx_hash_lock);
LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
if (lctx->inp == inp)
break;
}
mtx_unlock(&td->lctx_hash_lock);
return (lctx);
}
/*
* Removes the listen_ctx structure for inp from the hash and returns it.
*/
static struct listen_ctx *
listen_hash_del(struct adapter *sc, struct inpcb *inp)
{
struct tom_data *td = sc->tom_softc;
int bucket = listen_hashfn(inp, td->listen_mask);
struct listen_ctx *lctx, *l;
mtx_lock(&td->lctx_hash_lock);
LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
if (lctx->inp == inp) {
LIST_REMOVE(lctx, link);
td->lctx_count--;
break;
}
}
mtx_unlock(&td->lctx_hash_lock);
return (lctx);
}
/*
* Releases a hold on the lctx. Must be called with the listening socket's inp
* locked. The inp may be freed by this function and it returns NULL to
* indicate this.
*/
static struct inpcb *
release_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
struct inpcb *inp = lctx->inp;
int inp_freed = 0;
INP_WLOCK_ASSERT(inp);
if (refcount_release(&lctx->refcount))
inp_freed = free_lctx(sc, lctx);
return (inp_freed ? NULL : inp);
}
static void
send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
{
struct mbuf *m = synqe->syn;
if_t ifp = m->m_pkthdr.rcvif;
struct vi_info *vi = if_getsoftc(ifp);
struct port_info *pi = vi->pi;
struct wrqe *wr;
struct fw_flowc_wr *flowc;
struct sge_ofld_txq *ofld_txq;
struct sge_ofld_rxq *ofld_rxq;
const int nparams = 6;
const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
const u_int pfvf = sc->pf << S_FW_VIID_PFN;
INP_WLOCK_ASSERT(synqe->lctx->inp);
MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);
ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];
wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
if (wr == NULL) {
/* XXX */
panic("%s: allocation failure.", __func__);
}
flowc = wrtod(wr);
memset(flowc, 0, wr->wr_len);
flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
V_FW_FLOWC_WR_NPARAMS(nparams));
flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
V_FW_WR_FLOWID(synqe->tid));
flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
flowc->mnemval[0].val = htobe32(pfvf);
flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
flowc->mnemval[1].val = htobe32(pi->tx_chan);
flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
flowc->mnemval[2].val = htobe32(pi->tx_chan);
flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
flowc->mnemval[4].val = htobe32(512);
flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
flowc->mnemval[5].val = htobe32(512);
synqe->flags |= TPF_FLOWC_WR_SENT;
t4_wrq_tx(sc, wr);
}
static void
send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
int rst_status)
{
struct adapter *sc = tod->tod_softc;
struct wrqe *wr;
struct cpl_abort_req *req;
INP_WLOCK_ASSERT(synqe->lctx->inp);
CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
__func__, synqe, synqe->flags, synqe->tid,
synqe->flags & TPF_ABORT_SHUTDOWN ?
" (abort already in progress)" : "");
if (synqe->flags & TPF_ABORT_SHUTDOWN)
return; /* abort already in progress */
synqe->flags |= TPF_ABORT_SHUTDOWN;
if (!(synqe->flags & TPF_FLOWC_WR_SENT))
send_flowc_wr_synqe(sc, synqe);
wr = alloc_wrqe(sizeof(*req),
&sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
if (wr == NULL) {
/* XXX */
panic("%s: allocation failure.", __func__);
}
req = wrtod(wr);
INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
req->rsvd0 = 0; /* don't have a snd_nxt */
req->rsvd1 = 1; /* no data sent yet */
req->cmd = rst_status;
t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
}
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
struct wrqe *wr;
struct cpl_pass_open_req *req;
struct inpcb *inp = lctx->inp;
wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
if (wr == NULL) {
log(LOG_ERR, "%s: allocation failure", __func__);
return (ENOMEM);
}
req = wrtod(wr);
INIT_TP_WR(req, 0);
OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
req->local_port = inp->inp_lport;
req->peer_port = 0;
req->local_ip = inp->inp_laddr.s_addr;
req->peer_ip = 0;
req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
t4_wrq_tx(sc, wr);
return (0);
}
static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
struct wrqe *wr;
struct cpl_pass_open_req6 *req;
struct inpcb *inp = lctx->inp;
wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
if (wr == NULL) {
log(LOG_ERR, "%s: allocation failure", __func__);
return (ENOMEM);
}
req = wrtod(wr);
INIT_TP_WR(req, 0);
OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
req->local_port = inp->inp_lport;
req->peer_port = 0;
req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
req->peer_ip_hi = 0;
req->peer_ip_lo = 0;
req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
t4_wrq_tx(sc, wr);
return (0);
}
static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
struct wrqe *wr;
struct cpl_close_listsvr_req *req;
wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
if (wr == NULL) {
/* XXX */
panic("%s: allocation failure.", __func__);
}
req = wrtod(wr);
INIT_TP_WR(req, 0);
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
lctx->stid));
req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
req->rsvd = htobe16(0);
t4_wrq_tx(sc, wr);
return (0);
}
/*
* Start a listening server by sending a passive open request to HW.
*
* Can't take adapter lock here and access to sc->flags,
* sc->offload_map, if_capenable are all race prone.
*/
int
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
{
struct adapter *sc = tod->tod_softc;
struct vi_info *vi;
struct port_info *pi;
struct inpcb *inp = tptoinpcb(tp);
struct listen_ctx *lctx;
int i, rc, v;
struct offload_settings settings;
INP_WLOCK_ASSERT(inp);
rw_rlock(&sc->policy_lock);
settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
EVL_MAKETAG(0xfff, 0, 0), inp);
rw_runlock(&sc->policy_lock);
if (!settings.offload)
return (0);
/* Don't start a hardware listener for any loopback address. */
if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
return (0);
if (!(inp->inp_vflag & INP_IPV6) &&
IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
return (0);
if (sc->flags & KERN_TLS_ON)
return (0);
#if 0
ADAPTER_LOCK(sc);
if (IS_BUSY(sc)) {
log(LOG_ERR, "%s: listen request ignored, %s is busy",
__func__, device_get_nameunit(sc->dev));
goto done;
}
KASSERT(uld_active(sc, ULD_TOM),
("%s: TOM not initialized", __func__));
#endif
/*
* Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first
* such VI's queues to send the passive open and receive the reply to
* it.
*
* XXX: need a way to mark a port in use by offload. if_cxgbe should
* then reject any attempt to bring down such a port (and maybe reject
* attempts to disable IFCAP_TOE on that port too?).
*/
for_each_port(sc, i) {
pi = sc->port[i];
for_each_vi(pi, v, vi) {
if (vi->flags & VI_INIT_DONE &&
if_getcapenable(vi->ifp) & IFCAP_TOE)
goto found;
}
}
goto done; /* no port that's UP with IFCAP_TOE enabled */
found:
if (listen_hash_find(sc, inp) != NULL)
goto done; /* already setup */
lctx = alloc_lctx(sc, inp, vi);
if (lctx == NULL) {
log(LOG_ERR,
"%s: listen request ignored, %s couldn't allocate lctx\n",
__func__, device_get_nameunit(sc->dev));
goto done;
}
listen_hash_add(sc, lctx);
CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
__func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
inp->inp_vflag);
if (inp->inp_vflag & INP_IPV6)
rc = create_server6(sc, lctx);
else
rc = create_server(sc, lctx);
if (rc != 0) {
log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
__func__, device_get_nameunit(sc->dev), rc);
(void) listen_hash_del(sc, inp);
inp = release_lctx(sc, lctx);
/* can't be freed, host stack has a reference */
KASSERT(inp != NULL, ("%s: inp freed", __func__));
goto done;
}
lctx->flags |= LCTX_RPL_PENDING;
done:
#if 0
ADAPTER_UNLOCK(sc);
#endif
return (0);
}
int
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
struct listen_ctx *lctx;
struct adapter *sc = tod->tod_softc;
struct inpcb *inp = tptoinpcb(tp);
INP_WLOCK_ASSERT(inp);
lctx = listen_hash_del(sc, inp);
if (lctx == NULL)
return (ENOENT); /* no hardware listener for this inp */
CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
lctx, lctx->flags);
/*
* If the reply to the PASS_OPEN is still pending we'll wait for it to
* arrive and clean up when it does.
*/
if (lctx->flags & LCTX_RPL_PENDING) {
return (EINPROGRESS);
}
- destroy_server(sc, lctx);
+ if (lctx->flags & LCTX_SETUP_IN_HW)
+ destroy_server(sc, lctx);
return (0);
}
static inline struct synq_entry *
-alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
+alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags)
{
struct synq_entry *synqe;
INP_RLOCK_ASSERT(lctx->inp);
MPASS(flags == M_WAITOK || flags == M_NOWAIT);
synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
if (__predict_true(synqe != NULL)) {
synqe->flags = TPF_SYNQE;
+ synqe->incarnation = sc->incarnation;
refcount_init(&synqe->refcnt, 1);
synqe->lctx = lctx;
hold_lctx(lctx); /* Every synqe has a ref on its lctx. */
synqe->syn = NULL;
}
return (synqe);
}
static inline void
hold_synqe(struct synq_entry *synqe)
{
refcount_acquire(&synqe->refcnt);
}
static inline struct inpcb *
release_synqe(struct adapter *sc, struct synq_entry *synqe)
{
struct inpcb *inp;
MPASS(synqe->flags & TPF_SYNQE);
MPASS(synqe->lctx != NULL);
inp = synqe->lctx->inp;
MPASS(inp != NULL);
INP_WLOCK_ASSERT(inp);
if (refcount_release(&synqe->refcnt)) {
inp = release_lctx(sc, synqe->lctx);
m_freem(synqe->syn);
free(synqe, M_CXGBE);
}
return (inp);
}
void
t4_syncache_added(struct toedev *tod __unused, void *arg)
{
struct synq_entry *synqe = arg;
hold_synqe(synqe);
}
void
t4_syncache_removed(struct toedev *tod, void *arg)
{
struct adapter *sc = tod->tod_softc;
struct synq_entry *synqe = arg;
struct inpcb *inp = synqe->lctx->inp;
/*
* XXX: this is a LOR but harmless when running from the softclock.
*/
INP_WLOCK(inp);
inp = release_synqe(sc, synqe);
if (inp != NULL)
INP_WUNLOCK(inp);
}
int
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
struct synq_entry *synqe = arg;
if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
struct tcpopt to;
struct ip *ip = mtod(m, struct ip *);
struct tcphdr *th;
if (ip->ip_v == IPVERSION)
th = (void *)(ip + 1);
else
th = (void *)((struct ip6_hdr *)ip + 1);
bzero(&to, sizeof(to));
tcp_dooptions(&to, (void *)(th + 1),
(th->th_off << 2) - sizeof(*th), TO_SYN);
/* save these for later */
synqe->iss = be32toh(th->th_seq);
synqe->irs = be32toh(th->th_ack) - 1;
synqe->ts = to.to_tsval;
}
m_freem(m); /* don't need this any more */
return (0);
}
static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
int stid = GET_TID(cpl);
unsigned int status = cpl->status;
struct listen_ctx *lctx = lookup_stid(sc, stid);
struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
KASSERT(opcode == CPL_PASS_OPEN_RPL,
("%s: unexpected opcode 0x%x", __func__, opcode));
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
INP_WLOCK(inp);
CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
__func__, stid, status, lctx->flags);
lctx->flags &= ~LCTX_RPL_PENDING;
-
- if (status != CPL_ERR_NONE)
+ if (status == CPL_ERR_NONE)
+ lctx->flags |= LCTX_SETUP_IN_HW;
+ else
log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
#ifdef INVARIANTS
/*
* If the inp has been dropped (listening socket closed) then
* listen_stop must have run and taken the inp out of the hash.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(listen_hash_del(sc, inp) == NULL,
("%s: inp %p still in listen hash", __func__, inp));
}
#endif
if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
if (release_lctx(sc, lctx) != NULL)
INP_WUNLOCK(inp);
return (status);
}
/*
* Listening socket stopped listening earlier and now the chip tells us
* it has started the hardware listener. Stop it; the lctx will be
* released in do_close_server_rpl.
*/
if (inp->inp_flags & INP_DROPPED) {
destroy_server(sc, lctx);
INP_WUNLOCK(inp);
return (status);
}
/*
* Failed to start hardware listener. Take inp out of the hash and
* release our reference on it. An error message has been logged
* already.
*/
if (status != CPL_ERR_NONE) {
listen_hash_del(sc, inp);
if (release_lctx(sc, lctx) != NULL)
INP_WUNLOCK(inp);
return (status);
}
/* hardware listener open for business */
INP_WUNLOCK(inp);
return (status);
}
static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
int stid = GET_TID(cpl);
unsigned int status = cpl->status;
struct listen_ctx *lctx = lookup_stid(sc, stid);
struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
("%s: unexpected opcode 0x%x", __func__, opcode));
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
if (status != CPL_ERR_NONE) {
log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
__func__, status, stid);
return (status);
}
INP_WLOCK(inp);
inp = release_lctx(sc, lctx);
if (inp != NULL)
INP_WUNLOCK(inp);
return (status);
}
static void
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
{
struct tom_data *td = sc->tom_softc;
struct listen_ctx *lctx = synqe->lctx;
struct inpcb *inp = lctx->inp;
struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
int ntids;
INP_WLOCK_ASSERT(inp);
- ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
- remove_tid(sc, synqe->tid, ntids);
- mtx_lock(&td->toep_list_lock);
- TAILQ_REMOVE(&td->synqe_list, synqe, link);
- mtx_unlock(&td->toep_list_lock);
- release_tid(sc, synqe->tid, lctx->ctrlq);
+ if (synqe->tid != -1) {
+ ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
+ remove_tid(sc, synqe->tid, ntids);
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_REMOVE(&td->synqe_list, synqe, link);
+ mtx_unlock(&td->toep_list_lock);
+ release_tid(sc, synqe->tid, lctx->ctrlq);
+ }
t4_l2t_release(e);
inp = release_synqe(sc, synqe);
if (inp)
INP_WUNLOCK(inp);
}
void
-synack_failure_cleanup(struct adapter *sc, int tid)
+synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe)
{
- struct synq_entry *synqe = lookup_tid(sc, tid);
-
INP_WLOCK(synqe->lctx->inp);
done_with_synqe(sc, synqe);
}
int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
unsigned int tid = GET_TID(cpl);
struct synq_entry *synqe = lookup_tid(sc, tid);
struct listen_ctx *lctx = synqe->lctx;
struct inpcb *inp = lctx->inp;
struct sge_ofld_txq *ofld_txq;
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
KASSERT(opcode == CPL_ABORT_REQ_RSS,
("%s: unexpected opcode 0x%x", __func__, opcode));
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
__func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
if (negative_advice(cpl->status))
return (0); /* Ignore negative advice */
INP_WLOCK(inp);
ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
if (!(synqe->flags & TPF_FLOWC_WR_SENT))
send_flowc_wr_synqe(sc, synqe);
/*
* If we'd initiated an abort earlier the reply to it is responsible for
* cleaning up resources. Otherwise we tear everything down right here
* right now. We owe the T4 a CPL_ABORT_RPL no matter what.
*/
if (synqe->flags & TPF_ABORT_SHUTDOWN) {
INP_WUNLOCK(inp);
goto done;
}
done_with_synqe(sc, synqe);
/* inp lock released by done_with_synqe */
done:
send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
return (0);
}
int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
unsigned int tid = GET_TID(cpl);
struct synq_entry *synqe = lookup_tid(sc, tid);
struct listen_ctx *lctx = synqe->lctx;
struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
KASSERT(opcode == CPL_ABORT_RPL_RSS,
("%s: unexpected opcode 0x%x", __func__, opcode));
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
__func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
INP_WLOCK(inp);
KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
("%s: wasn't expecting abort reply for synqe %p (0x%x)",
__func__, synqe, synqe->flags));
done_with_synqe(sc, synqe);
/* inp lock released by done_with_synqe */
return (0);
}
void
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
struct adapter *sc = tod->tod_softc;
struct tom_data *td = sc->tom_softc;
struct synq_entry *synqe = arg;
struct inpcb *inp = sotoinpcb(so);
struct toepcb *toep = synqe->toep;
NET_EPOCH_ASSERT(); /* prevents bad race with accept() */
INP_WLOCK_ASSERT(inp);
KASSERT(synqe->flags & TPF_SYNQE,
("%s: %p not a synq_entry?", __func__, arg));
MPASS(toep->tid == synqe->tid);
offload_socket(so, toep);
make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
toep->flags |= TPF_CPL_PENDING;
update_tid(sc, synqe->tid, toep);
synqe->flags |= TPF_SYNQE_EXPANDED;
mtx_lock(&td->toep_list_lock);
TAILQ_REMOVE(&td->synqe_list, synqe, link);
mtx_unlock(&td->toep_list_lock);
inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
inp->inp_flowid = synqe->rss_hash;
}
static void
t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
{
bzero(to, sizeof(*to));
if (t4opt->mss) {
to->to_flags |= TOF_MSS;
to->to_mss = be16toh(t4opt->mss);
}
if (t4opt->wsf > 0 && t4opt->wsf < 15) {
to->to_flags |= TOF_SCALE;
to->to_wscale = t4opt->wsf;
}
if (t4opt->tstamp)
to->to_flags |= TOF_TS;
if (t4opt->sack)
to->to_flags |= TOF_SACKPERM;
}
static bool
encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
{
u_int hlen = be32toh(cpl->hdr_len);
if (chip_id(sc) >= CHELSIO_T6)
return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
else
return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
}
static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
{
const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
const struct ether_header *eh;
unsigned int hlen = be32toh(cpl->hdr_len);
uintptr_t l3hdr;
const struct tcphdr *tcp;
eh = (const void *)(cpl + 1);
if (chip_id(sc) >= CHELSIO_T6) {
l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
} else {
l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
}
/* extract TOS (DiffServ + ECN) byte for AccECN */
if (iptos) {
if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
const struct ip *ip = (const void *)l3hdr;
*iptos = ip->ip_tos;
}
#ifdef INET6
else
if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
const struct ip6_hdr *ip6 = (const void *)l3hdr;
*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
}
#endif /* INET6 */
}
if (inc) {
bzero(inc, sizeof(*inc));
inc->inc_fport = tcp->th_sport;
inc->inc_lport = tcp->th_dport;
if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
const struct ip *ip = (const void *)l3hdr;
inc->inc_faddr = ip->ip_src;
inc->inc_laddr = ip->ip_dst;
} else {
const struct ip6_hdr *ip6 = (const void *)l3hdr;
inc->inc_flags |= INC_ISIPV6;
inc->inc6_faddr = ip6->ip6_src;
inc->inc6_laddr = ip6->ip6_dst;
}
}
if (th) {
bcopy(tcp, th, sizeof(*th));
tcp_fields_to_host(th); /* just like tcp_input */
}
}
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, if_t ifp,
struct in_conninfo *inc)
{
struct l2t_entry *e;
struct sockaddr_in6 sin6;
struct sockaddr *dst = (void *)&sin6;
struct nhop_object *nh;
if (inc->inc_flags & INC_ISIPV6) {
bzero(dst, sizeof(struct sockaddr_in6));
dst->sa_len = sizeof(struct sockaddr_in6);
dst->sa_family = AF_INET6;
if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
/* no need for route lookup */
e = t4_l2t_get(pi, ifp, dst);
return (e);
}
nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
if (nh == NULL)
return (NULL);
if (nh->nh_ifp != ifp)
return (NULL);
if (nh->nh_flags & NHF_GATEWAY)
((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
else
((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
} else {
dst->sa_len = sizeof(struct sockaddr_in);
dst->sa_family = AF_INET;
nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
if (nh == NULL)
return (NULL);
if (nh->nh_ifp != ifp)
return (NULL);
if (nh->nh_flags & NHF_GATEWAY)
if (nh->gw_sa.sa_family == AF_INET)
((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
else
*((struct sockaddr_in6 *)dst) = nh->gw6_sa;
else
((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
}
e = t4_l2t_get(pi, ifp, dst);
return (e);
}
static int
send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
uint32_t opt2, int tid)
{
struct wrqe *wr;
struct cpl_pass_accept_rpl *rpl;
struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
if (wr == NULL)
return (ENOMEM);
rpl = wrtod(wr);
if (is_t4(sc))
INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
else {
struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
rpl5->iss = htobe32(synqe->iss);
}
rpl->opt0 = opt0;
rpl->opt2 = opt2;
return (t4_l2t_send(sc, wr, e));
}
#define REJECT_PASS_ACCEPT_REQ(tunnel) do { \
if (!tunnel) { \
m_freem(m); \
m = NULL; \
} \
reject_reason = __LINE__; \
goto reject; \
} while (0)
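/*
 * Editorial note: the "tunnel" argument selects the fallback behaviour.
 * REJECT_PASS_ACCEPT_REQ(true) keeps the mbuf so that the reject path can
 * hand the SYN to the kernel's software TCP stack via if_input();
 * REJECT_PASS_ACCEPT_REQ(false) frees it, so the SYN is not passed up at all.
 */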
/*
* The context associated with a tid entry via insert_tid could be a synq_entry
* or a toepcb. The only way CPL handlers can tell is via a bit in these flags.
*/
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
/*
* Incoming SYN on a listening socket.
*
* XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
* etc.
*/
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
struct tom_data *td = sc->tom_softc;
struct toedev *tod;
const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
unsigned int tid = GET_TID(cpl);
struct listen_ctx *lctx = lookup_stid(sc, stid);
struct inpcb *inp;
struct socket *so;
struct in_conninfo inc;
struct tcphdr th;
struct tcpopt to;
struct port_info *pi;
struct vi_info *vi;
if_t hw_ifp, ifp;
struct l2t_entry *e = NULL;
struct synq_entry *synqe = NULL;
int reject_reason, v, ntids;
uint16_t vid, l2info;
struct epoch_tracker et;
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
struct offload_settings settings;
uint8_t iptos;
KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
("%s: unexpected opcode 0x%x", __func__, opcode));
KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
lctx);
/*
* Figure out the port the SYN arrived on. We'll look for an exact VI
* match in a bit but in case we don't find any we'll use the main VI as
* the incoming ifnet.
*/
l2info = be16toh(cpl->l2info);
pi = sc->port[G_SYN_INTF(l2info)];
hw_ifp = pi->vi[0].ifp;
m->m_pkthdr.rcvif = hw_ifp;
CURVNET_SET(lctx->vnet); /* before any potential REJECT */
/*
* If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
* also hit the listener. We don't want to offload those.
*/
if (encapsulated_syn(sc, cpl)) {
REJECT_PASS_ACCEPT_REQ(true);
}
/*
* Use the MAC index to lookup the associated VI. If this SYN didn't
* match a perfect MAC filter, punt.
*/
if (!(l2info & F_SYN_XACT_MATCH)) {
REJECT_PASS_ACCEPT_REQ(true);
}
for_each_vi(pi, v, vi) {
if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
goto found;
}
REJECT_PASS_ACCEPT_REQ(true);
found:
hw_ifp = vi->ifp; /* the cxgbe ifnet */
m->m_pkthdr.rcvif = hw_ifp;
tod = TOEDEV(hw_ifp);
/*
* Don't offload if the peer requested a TCP option that's not known to
* the silicon. Send the SYN to the kernel instead.
*/
if (__predict_false(cpl->tcpopt.unknown))
REJECT_PASS_ACCEPT_REQ(true);
/*
* Figure out if there is a pseudo interface (vlan, lagg, etc.)
* involved. Don't offload if the SYN had a VLAN tag and the vid
* doesn't match anything on this interface.
*
* XXX: lagg support, lagg + vlan support.
*/
vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
if (vid != 0xfff && vid != 0) {
ifp = VLAN_DEVAT(hw_ifp, vid);
if (ifp == NULL)
REJECT_PASS_ACCEPT_REQ(true);
} else
ifp = hw_ifp;
/*
* Don't offload if the ifnet that the SYN came in on is not in the same
* vnet as the listening socket.
*/
if (lctx->vnet != if_getvnet(ifp))
REJECT_PASS_ACCEPT_REQ(true);
pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
if (inc.inc_flags & INC_ISIPV6) {
/* Don't offload if the ifcap isn't enabled */
if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0)
REJECT_PASS_ACCEPT_REQ(true);
/*
* SYN must be directed to an IP6 address on this ifnet. This
* is more restrictive than in6_localip.
*/
NET_EPOCH_ENTER(et);
if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(true);
}
ntids = 2;
} else {
/* Don't offload if the ifcap isn't enabled */
if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0)
REJECT_PASS_ACCEPT_REQ(true);
/*
* SYN must be directed to an IP address on this ifnet. This
* is more restrictive than in_localip.
*/
NET_EPOCH_ENTER(et);
if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(true);
}
ntids = 1;
}
e = get_l2te_for_nexthop(pi, ifp, &inc);
if (e == NULL) {
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(true);
}
/* Don't offload if the 4-tuple is already in use */
if (toe_4tuple_check(&inc, &th, ifp) != 0) {
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(false);
}
inp = lctx->inp; /* listening socket, not owned by TOE */
INP_RLOCK(inp);
/* Don't offload if the listening socket has closed */
if (__predict_false(inp->inp_flags & INP_DROPPED)) {
INP_RUNLOCK(inp);
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(false);
}
so = inp->inp_socket;
rw_rlock(&sc->policy_lock);
settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
EVL_MAKETAG(0xfff, 0, 0), inp);
rw_runlock(&sc->policy_lock);
if (!settings.offload) {
INP_RUNLOCK(inp);
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */
}
synqe = alloc_synqe(sc, lctx, M_NOWAIT);
if (synqe == NULL) {
INP_RUNLOCK(inp);
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(true);
}
MPASS(rss->hash_type == RSS_HASH_TCP);
synqe->rss_hash = be32toh(rss->hash_val);
atomic_store_int(&synqe->ok_to_respond, 0);
init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
&synqe->params);
/*
* If all goes well t4_syncache_respond will get called during
* syncache_add. Note that syncache_add releases the pcb lock.
*/
t4opt_to_tcpopt(&cpl->tcpopt, &to);
toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
if (atomic_load_int(&synqe->ok_to_respond) > 0) {
uint64_t opt0;
uint32_t opt2;
opt0 = calc_options0(vi, &synqe->params);
opt2 = calc_options2(vi, &synqe->params);
insert_tid(sc, tid, synqe, ntids);
synqe->tid = tid;
synqe->syn = m;
m = NULL;
if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
remove_tid(sc, tid, ntids);
m = synqe->syn;
synqe->syn = NULL;
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(true);
}
mtx_lock(&td->toep_list_lock);
TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link);
mtx_unlock(&td->toep_list_lock);
CTR6(KTR_CXGBE,
"%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
__func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
} else {
NET_EPOCH_EXIT(et);
REJECT_PASS_ACCEPT_REQ(false);
}
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (0);
reject:
CURVNET_RESTORE();
CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
reject_reason);
if (e)
t4_l2t_release(e);
release_tid(sc, tid, lctx->ctrlq);
if (synqe) {
inp = synqe->lctx->inp;
INP_WLOCK(inp);
inp = release_synqe(sc, synqe);
if (inp)
INP_WUNLOCK(inp);
}
if (m) {
/*
* The connection request hit a TOE listener but is being passed
* on to the kernel sw stack instead of getting offloaded.
*/
m_adj(m, sizeof(*cpl));
m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
m->m_pkthdr.csum_data = 0xffff;
if_input(hw_ifp, m);
}
return (reject_reason);
}
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
struct tcphdr *th, struct tcpopt *to)
{
uint16_t tcp_opt = be16toh(cpl->tcp_opt);
uint8_t iptos;
/* start off with the original SYN */
pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
/* modify parts to make it look like the ACK to our SYN|ACK */
th->th_flags = TH_ACK;
th->th_ack = synqe->iss + 1;
th->th_seq = be32toh(cpl->rcv_isn);
bzero(to, sizeof(*to));
if (G_TCPOPT_TSTAMP(tcp_opt)) {
to->to_flags |= TOF_TS;
to->to_tsecr = synqe->ts;
}
}
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
struct mbuf *m)
{
struct adapter *sc = iq->adapter;
struct vi_info *vi;
if_t ifp;
const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
unsigned int tid = GET_TID(cpl);
struct synq_entry *synqe = lookup_tid(sc, tid);
struct listen_ctx *lctx = synqe->lctx;
struct inpcb *inp = lctx->inp, *new_inp;
struct socket *so;
struct tcphdr th;
struct tcpopt to;
struct in_conninfo inc;
struct toepcb *toep;
struct epoch_tracker et;
int rstreason;
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
KASSERT(opcode == CPL_PASS_ESTABLISH,
("%s: unexpected opcode 0x%x", __func__, opcode));
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
KASSERT(synqe->flags & TPF_SYNQE,
("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
CURVNET_SET(lctx->vnet);
NET_EPOCH_ENTER(et); /* for syncache_expand */
INP_WLOCK(inp);
CTR6(KTR_CXGBE,
"%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
__func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
ifp = synqe->syn->m_pkthdr.rcvif;
vi = if_getsoftc(ifp);
KASSERT(vi->adapter == sc,
("%s: vi %p, sc %p mismatch", __func__, vi, sc));
if (__predict_false(inp->inp_flags & INP_DROPPED)) {
reset:
send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
INP_WUNLOCK(inp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (0);
}
KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
("%s: CPL arrived on unexpected rxq. %d %d", __func__,
synqe->params.rxq_idx,
(int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
toep = alloc_toepcb(vi, M_NOWAIT);
if (toep == NULL)
goto reset;
toep->tid = tid;
toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
toep->vnet = lctx->vnet;
bcopy(&synqe->params, &toep->params, sizeof(toep->params));
init_toepcb(vi, toep);
MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
synqe->tcp_opt = cpl->tcp_opt;
synqe->toep = toep;
/* Come up with something that syncache_expand should be ok with. */
synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
if (inc.inc_flags & INC_ISIPV6) {
if (lctx->ce == NULL) {
toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
if (toep->ce == NULL) {
free_toepcb(toep);
goto reset; /* RST without a CLIP entry? */
}
} else {
t4_hold_clip_entry(sc, lctx->ce);
toep->ce = lctx->ce;
}
}
so = inp->inp_socket;
KASSERT(so != NULL, ("%s: socket is NULL", __func__));
rstreason = toe_syncache_expand(&inc, &to, &th, &so);
if (rstreason < 0) {
free_toepcb(toep);
send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
INP_WUNLOCK(inp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (0);
} else if (rstreason == 0 || so == NULL) {
free_toepcb(toep);
goto reset;
}
/* New connection inpcb is already locked by syncache_expand(). */
new_inp = sotoinpcb(so);
INP_WLOCK_ASSERT(new_inp);
MPASS(so->so_vnet == lctx->vnet);
/*
* This is for expansion from syncookies.
*
* XXX: we've held the tcbinfo lock throughout so there's no risk of
* anyone accept'ing a connection before we've installed our hooks, but
* this somewhat defeats the purpose of having a tod_offload_socket :-(
*/
if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
t4_offload_socket(TOEDEV(ifp), synqe, so);
}
INP_WUNLOCK(new_inp);
/* Done with the synqe */
inp = release_synqe(sc, synqe);
if (inp != NULL)
INP_WUNLOCK(inp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (0);
}
void
t4_init_listen_cpl_handlers(void)
{
t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}
void
t4_uninit_listen_cpl_handlers(void)
{
t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
}
#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c
index fb92c88aa358..58a77ff93c86 100644
--- a/sys/dev/cxgbe/tom/t4_tom.c
+++ b/sys/dev/cxgbe/tom/t4_tom.c
@@ -1,2088 +1,2302 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/limits.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/refcount.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/scope6_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>
#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_tcb.h"
#include "t4_clip.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"
#include "tom/t4_tls.h"
static struct protosw toe_protosw;
static struct protosw toe6_protosw;
/* Module ops */
static int t4_tom_mod_load(void);
static int t4_tom_mod_unload(void);
static int t4_tom_modevent(module_t, int, void *);
/* ULD ops and helpers */
static int t4_tom_activate(struct adapter *);
static int t4_tom_deactivate(struct adapter *);
+static int t4_tom_stop(struct adapter *);
+static int t4_tom_restart(struct adapter *);
static struct uld_info tom_uld_info = {
.uld_activate = t4_tom_activate,
.uld_deactivate = t4_tom_deactivate,
+ .uld_stop = t4_tom_stop,
+ .uld_restart = t4_tom_restart,
};
static void release_offload_resources(struct toepcb *);
static void done_with_toepcb(struct toepcb *);
-static int alloc_tid_tabs(struct tid_info *);
-static void free_tid_tabs(struct tid_info *);
+static int alloc_tid_tabs(struct adapter *);
+static void free_tid_tabs(struct adapter *);
static void free_tom_data(struct adapter *, struct tom_data *);
static void reclaim_wr_resources(void *, int);
+static void cleanup_stranded_tids(void *, int);
struct toepcb *
alloc_toepcb(struct vi_info *vi, int flags)
{
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
struct toepcb *toep;
int tx_credits, txsd_total, len;
/*
* The firmware counts tx work request credits in units of 16 bytes
* each. Reserve room for an ABORT_REQ so the driver never has to worry
* about tx credits if it wants to abort a connection.
*/
tx_credits = sc->params.ofldq_wr_cred;
tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
/*
* Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
* immediate payload, and firmware counts tx work request credits in
* units of 16 bytes. Calculate the maximum number of work requests possible.
*/
txsd_total = tx_credits /
howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
len = offsetof(struct toepcb, txsd) +
txsd_total * sizeof(struct ofld_tx_sdesc);
toep = malloc(len, M_CXGBE, M_ZERO | flags);
if (toep == NULL)
return (NULL);
refcount_init(&toep->refcount, 1);
toep->td = sc->tom_softc;
+ toep->incarnation = sc->incarnation;
toep->vi = vi;
toep->tid = -1;
toep->tx_total = tx_credits;
toep->tx_credits = tx_credits;
mbufq_init(&toep->ulp_pduq, INT_MAX);
mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
toep->txsd_total = txsd_total;
toep->txsd_avail = txsd_total;
toep->txsd_pidx = 0;
toep->txsd_cidx = 0;
aiotx_init_toep(toep);
return (toep);
}
/*
* Initialize a toepcb after its params have been filled out.
*/
int
init_toepcb(struct vi_info *vi, struct toepcb *toep)
{
struct conn_params *cp = &toep->params;
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
struct tx_cl_rl_params *tc;
if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) {
tc = &pi->sched_params->cl_rl[cp->tc_idx];
mtx_lock(&sc->tc_lock);
if (tc->state != CS_HW_CONFIGURED) {
CH_ERR(vi, "tid %d cannot be bound to traffic class %d "
"because it is not configured (its state is %d)\n",
toep->tid, cp->tc_idx, tc->state);
cp->tc_idx = -1;
} else {
tc->refcount++;
}
mtx_unlock(&sc->tc_lock);
}
toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx];
toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
tls_init_toep(toep);
MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP);
toep->flags |= TPF_INITIALIZED;
return (0);
}
struct toepcb *
hold_toepcb(struct toepcb *toep)
{
refcount_acquire(&toep->refcount);
return (toep);
}
void
free_toepcb(struct toepcb *toep)
{
if (refcount_release(&toep->refcount) == 0)
return;
KASSERT(!(toep->flags & TPF_ATTACHED),
("%s: attached to an inpcb", __func__));
KASSERT(!(toep->flags & TPF_CPL_PENDING),
("%s: CPL pending", __func__));
if (toep->flags & TPF_INITIALIZED) {
if (ulp_mode(toep) == ULP_MODE_TCPDDP)
ddp_uninit_toep(toep);
tls_uninit_toep(toep);
}
free(toep, M_CXGBE);
}
/*
* Set up the socket for TCP offload.
*/
void
offload_socket(struct socket *so, struct toepcb *toep)
{
struct tom_data *td = toep->td;
struct inpcb *inp = sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
struct sockbuf *sb;
INP_WLOCK_ASSERT(inp);
/* Update socket */
sb = &so->so_snd;
SOCKBUF_LOCK(sb);
sb->sb_flags |= SB_NOCOALESCE;
SOCKBUF_UNLOCK(sb);
sb = &so->so_rcv;
SOCKBUF_LOCK(sb);
sb->sb_flags |= SB_NOCOALESCE;
if (inp->inp_vflag & INP_IPV6)
so->so_proto = &toe6_protosw;
else
so->so_proto = &toe_protosw;
SOCKBUF_UNLOCK(sb);
/* Update TCP PCB */
tp->tod = &td->tod;
tp->t_toe = toep;
tp->t_flags |= TF_TOE;
/* Install an extra hold on inp */
toep->inp = inp;
toep->flags |= TPF_ATTACHED;
in_pcbref(inp);
/* Add the TOE PCB to the active list */
mtx_lock(&td->toep_list_lock);
TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
+ toep->flags |= TPF_IN_TOEP_LIST;
mtx_unlock(&td->toep_list_lock);
}
void
restore_so_proto(struct socket *so, bool v6)
{
if (v6)
so->so_proto = &tcp6_protosw;
else
so->so_proto = &tcp_protosw;
}
/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
struct toepcb *toep = tp->t_toe;
struct tom_data *td = toep->td;
struct sockbuf *sb;
INP_WLOCK_ASSERT(inp);
sb = &so->so_snd;
SOCKBUF_LOCK(sb);
sb->sb_flags &= ~SB_NOCOALESCE;
SOCKBUF_UNLOCK(sb);
sb = &so->so_rcv;
SOCKBUF_LOCK(sb);
sb->sb_flags &= ~SB_NOCOALESCE;
restore_so_proto(so, inp->inp_vflag & INP_IPV6);
SOCKBUF_UNLOCK(sb);
tp->tod = NULL;
tp->t_toe = NULL;
tp->t_flags &= ~TF_TOE;
toep->inp = NULL;
toep->flags &= ~TPF_ATTACHED;
if (in_pcbrele_wlocked(inp))
panic("%s: inp freed.", __func__);
mtx_lock(&td->toep_list_lock);
+ toep->flags &= ~TPF_IN_TOEP_LIST;
TAILQ_REMOVE(&td->toep_list, toep, link);
mtx_unlock(&td->toep_list_lock);
}
static void
release_offload_resources(struct toepcb *toep)
{
struct tom_data *td = toep->td;
struct adapter *sc = td_adapter(td);
int tid = toep->tid;
KASSERT(!(toep->flags & TPF_CPL_PENDING),
("%s: %p has CPL pending.", __func__, toep));
CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
__func__, toep, tid, toep->l2te, toep->ce);
if (toep->l2te) {
t4_l2t_release(toep->l2te);
toep->l2te = NULL;
}
if (tid >= 0) {
remove_tid(sc, tid, toep->ce ? 2 : 1);
release_tid(sc, tid, toep->ctrlq);
toep->tid = -1;
}
if (toep->ce) {
t4_release_clip_entry(sc, toep->ce);
toep->ce = NULL;
}
if (toep->params.tc_idx != -1)
t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx);
}
/*
* Both the driver and kernel are done with the toepcb.
*/
static void
done_with_toepcb(struct toepcb *toep)
{
struct tom_data *td = toep->td;
KASSERT(!(toep->flags & TPF_CPL_PENDING),
("%s: %p has CPL pending.", __func__, toep));
KASSERT(!(toep->flags & TPF_ATTACHED),
("%s: %p is still attached.", __func__, toep));
CTR(KTR_CXGBE, "%s: toep %p (0x%x)", __func__, toep, toep->flags);
/*
* These queues should have been emptied at approximately the same time
* that a normal connection's socket's so_snd would have been purged or
* drained. Do _not_ clean up here.
*/
MPASS(mbufq_empty(&toep->ulp_pduq));
MPASS(mbufq_empty(&toep->ulp_pdu_reclaimq));
#ifdef INVARIANTS
if (ulp_mode(toep) == ULP_MODE_TCPDDP)
ddp_assert_empty(toep);
#endif
MPASS(TAILQ_EMPTY(&toep->aiotx_jobq));
MPASS(toep->tid == -1);
MPASS(toep->l2te == NULL);
MPASS(toep->ce == NULL);
mtx_lock(&td->toep_list_lock);
- TAILQ_REMOVE(&td->toep_list, toep, link);
+ if (toep->flags & TPF_IN_TOEP_LIST) {
+ toep->flags &= ~TPF_IN_TOEP_LIST;
+ TAILQ_REMOVE(&td->toep_list, toep, link);
+ }
mtx_unlock(&td->toep_list_lock);
free_toepcb(toep);
}
/*
* The kernel is done with the TCP PCB and this is our opportunity to unhook the
* toepcb hanging off of it. If the TOE driver is also done with the toepcb (no
* pending CPL) then it is time to release all resources tied to the toepcb.
*
* Also gets called when an offloaded active open fails and the TOM wants the
* kernel to take the TCP PCB back.
*/
void
t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{
#if defined(KTR) || defined(INVARIANTS)
struct inpcb *inp = tptoinpcb(tp);
#endif
struct toepcb *toep = tp->t_toe;
INP_WLOCK_ASSERT(inp);
KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
KASSERT(toep->flags & TPF_ATTACHED,
("%s: not attached", __func__));
#ifdef KTR
if (tp->t_state == TCPS_SYN_SENT) {
CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
__func__, toep->tid, toep, toep->flags, inp,
inp->inp_flags);
} else {
CTR6(KTR_CXGBE,
"t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
inp->inp_flags);
}
#endif
tp->tod = NULL;
tp->t_toe = NULL;
tp->t_flags &= ~TF_TOE;
toep->flags &= ~TPF_ATTACHED;
if (!(toep->flags & TPF_CPL_PENDING))
done_with_toepcb(toep);
}
/*
* setsockopt handler.
*/
static void
t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
{
struct adapter *sc = tod->tod_softc;
struct toepcb *toep = tp->t_toe;
if (dir == SOPT_GET)
return;
CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
switch (name) {
case TCP_NODELAY:
if (tp->t_state != TCPS_ESTABLISHED)
break;
toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0);
break;
default:
break;
}
}
static inline uint64_t
get_tcb_tflags(const uint64_t *tcb)
{
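/* The 64-bit t_flags field straddles two adjacent flits of the TCB image. */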
return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32));
}
static inline uint32_t
get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift)
{
#define LAST_WORD ((TCB_SIZE / 4) - 1)
uint64_t t1, t2;
int flit_idx;
MPASS(mask != 0);
MPASS(word <= LAST_WORD);
MPASS(shift < 32);
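/*
 * The TCB image is stored with the highest-numbered 32-bit word first, so
 * word 'word' lives in 64-bit flit (LAST_WORD - word) / 2.
 */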
flit_idx = (LAST_WORD - word) / 2;
if (word & 0x1)
shift += 32;
t1 = be64toh(tcb[flit_idx]) >> shift;
t2 = 0;
if (fls(mask) > 64 - shift) {
/*
* Will spill over into the next logical flit, which is the flit
* before this one. The flit_idx before this one must be valid.
*/
MPASS(flit_idx > 0);
t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
}
return ((t2 | t1) & mask);
#undef LAST_WORD
}
#define GET_TCB_FIELD(tcb, F) \
get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F)
/*
* Issues a CPL_GET_TCB to read the entire TCB for the tid.
*/
static int
send_get_tcb(struct adapter *sc, u_int tid)
{
struct cpl_get_tcb *cpl;
struct wrq_cookie cookie;
MPASS(tid >= sc->tids.tid_base);
MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16),
&cookie);
if (__predict_false(cpl == NULL))
return (ENOMEM);
bzero(cpl, sizeof(*cpl));
INIT_TP_WR(cpl, tid);
OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
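/* Deliver the reply to the first offload rx queue; do_get_tcb_rpl processes it. */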
cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
cpl->cookie = 0xff;
commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);
return (0);
}
static struct tcb_histent *
alloc_tcb_histent(struct adapter *sc, u_int tid, int flags)
{
struct tcb_histent *te;
MPASS(flags == M_NOWAIT || flags == M_WAITOK);
te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags);
if (te == NULL)
return (NULL);
mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF);
callout_init_mtx(&te->te_callout, &te->te_lock, 0);
te->te_adapter = sc;
te->te_tid = tid;
return (te);
}
static void
free_tcb_histent(struct tcb_histent *te)
{
mtx_destroy(&te->te_lock);
free(te, M_CXGBE);
}
/*
* Start tracking the tid in the TCB history.
*/
int
add_tid_to_history(struct adapter *sc, u_int tid)
{
struct tcb_histent *te = NULL;
struct tom_data *td = sc->tom_softc;
int rc;
MPASS(tid >= sc->tids.tid_base);
MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
if (td->tcb_history == NULL)
return (ENXIO);
rw_wlock(&td->tcb_history_lock);
if (td->tcb_history[tid] != NULL) {
rc = EEXIST;
goto done;
}
te = alloc_tcb_histent(sc, tid, M_NOWAIT);
if (te == NULL) {
rc = ENOMEM;
goto done;
}
mtx_lock(&te->te_lock);
rc = send_get_tcb(sc, tid);
if (rc == 0) {
te->te_flags |= TE_RPL_PENDING;
td->tcb_history[tid] = te;
} else {
free(te, M_CXGBE);
}
mtx_unlock(&te->te_lock);
done:
rw_wunlock(&td->tcb_history_lock);
return (rc);
}
static void
remove_tcb_histent(struct tcb_histent *te)
{
struct adapter *sc = te->te_adapter;
struct tom_data *td = sc->tom_softc;
rw_assert(&td->tcb_history_lock, RA_WLOCKED);
mtx_assert(&te->te_lock, MA_OWNED);
MPASS(td->tcb_history[te->te_tid] == te);
td->tcb_history[te->te_tid] = NULL;
free_tcb_histent(te);
rw_wunlock(&td->tcb_history_lock);
}
static inline struct tcb_histent *
lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem)
{
struct tcb_histent *te;
struct tom_data *td = sc->tom_softc;
MPASS(tid >= sc->tids.tid_base);
MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
if (td->tcb_history == NULL)
return (NULL);
if (addrem)
rw_wlock(&td->tcb_history_lock);
else
rw_rlock(&td->tcb_history_lock);
te = td->tcb_history[tid];
if (te != NULL) {
mtx_lock(&te->te_lock);
return (te); /* with both locks held */
}
if (addrem)
rw_wunlock(&td->tcb_history_lock);
else
rw_runlock(&td->tcb_history_lock);
return (te);
}
static inline void
release_tcb_histent(struct tcb_histent *te)
{
struct adapter *sc = te->te_adapter;
struct tom_data *td = sc->tom_softc;
mtx_assert(&te->te_lock, MA_OWNED);
mtx_unlock(&te->te_lock);
rw_assert(&td->tcb_history_lock, RA_RLOCKED);
rw_runlock(&td->tcb_history_lock);
}
static void
request_tcb(void *arg)
{
struct tcb_histent *te = arg;
mtx_assert(&te->te_lock, MA_OWNED);
/* No one else is supposed to update the histent. */
MPASS(!(te->te_flags & TE_RPL_PENDING));
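/* Reissue the GET_TCB; if the control queue is out of space, retry in ~10ms. */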
if (send_get_tcb(te->te_adapter, te->te_tid) == 0)
te->te_flags |= TE_RPL_PENDING;
else
callout_schedule(&te->te_callout, hz / 100);
}
static void
update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb)
{
struct tom_data *td = te->te_adapter->tom_softc;
uint64_t tflags = get_tcb_tflags(tcb);
uint8_t sample = 0;
if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) {
if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0)
sample |= TS_RTO;
if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0)
sample |= TS_DUPACKS;
if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold)
sample |= TS_FASTREXMT;
}
if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) {
uint32_t snd_wnd;
sample |= TS_SND_BACKLOGGED; /* for whatever reason. */
snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
if (tflags & V_TF_RECV_SCALE(1))
snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE);
if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd)
sample |= TS_CWND_LIMITED; /* maybe due to CWND */
}
if (tflags & V_TF_CCTRL_ECN(1)) {
/*
* CE marker on incoming IP hdr, echoing ECE back in the TCP
* hdr. Indicates congestion somewhere on the way from the peer
* to this node.
*/
if (tflags & V_TF_CCTRL_ECE(1))
sample |= TS_ECN_ECE;
/*
* ECE seen and CWR sent (or about to be sent). Might indicate
* congestion on the way to the peer. This node is reducing its
* congestion window in response.
*/
if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1)))
sample |= TS_ECN_CWR;
}
te->te_sample[te->te_pidx] = sample;
if (++te->te_pidx == nitems(te->te_sample))
te->te_pidx = 0;
memcpy(te->te_tcb, tcb, TCB_SIZE);
te->te_flags |= TE_ACTIVE;
}
static int
do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *);
const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1);
struct tcb_histent *te;
const u_int tid = GET_TID(cpl);
bool remove;
remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED;
te = lookup_tcb_histent(sc, tid, remove);
if (te == NULL) {
/* Not in the history. Who issued the GET_TCB for this? */
device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, "
"srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid,
(uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE),
GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE),
GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie);
goto done;
}
MPASS(te->te_flags & TE_RPL_PENDING);
te->te_flags &= ~TE_RPL_PENDING;
if (remove) {
remove_tcb_histent(te);
} else {
update_tcb_histent(te, tcb);
callout_reset(&te->te_callout, hz / 10, request_tcb, te);
release_tcb_histent(te);
}
done:
m_freem(m);
return (0);
}
static void
fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti)
{
uint32_t v;
ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE);
v = GET_TCB_FIELD(tcb, T_SRTT);
ti->tcpi_rtt = tcp_ticks_to_us(sc, v);
v = GET_TCB_FIELD(tcb, T_RTTVAR);
ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);
ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH);
ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND);
ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT);
ti->tcpi_rcv_adv = GET_TCB_FIELD(tcb, RCV_ADV);
ti->tcpi_dupacks = GET_TCB_FIELD(tcb, T_DUPACKS);
v = GET_TCB_FIELD(tcb, TX_MAX);
ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW);
ti->tcpi_snd_una = v - GET_TCB_FIELD(tcb, SND_UNA_RAW);
ti->tcpi_snd_max = v - GET_TCB_FIELD(tcb, SND_MAX_RAW);
/* Receive window being advertised by us. */
ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */
ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND);
/* Send window */
ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */
ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1))
ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale;
else
ti->tcpi_snd_wscale = 0;
}
static void
fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te,
struct tcp_info *ti)
{
fill_tcp_info_from_tcb(sc, te->te_tcb, ti);
}
/*
* Reads the TCB for the given tid using a memory window and copies it to 'buf'
* in the same format as CPL_GET_TCB_RPL.
*/
static void
read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf)
{
int i, j, k, rc;
uint32_t addr;
u_char *tcb, tmp;
MPASS(tid >= sc->tids.tid_base);
MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE;
rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE);
if (rc != 0)
return;
tcb = (u_char *)buf;
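/*
 * Reverse the order of the 16-byte chunks so that the buffer matches the
 * layout of a CPL_GET_TCB_RPL payload.
 */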
for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
for (k = 0; k < 16; k++) {
tmp = tcb[i + k];
tcb[i + k] = tcb[j + k];
tcb[j + k] = tmp;
}
}
}
static void
fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti)
{
uint64_t tcb[TCB_SIZE / sizeof(uint64_t)];
struct tcb_histent *te;
ti->tcpi_toe_tid = tid;
te = lookup_tcb_histent(sc, tid, false);
if (te != NULL) {
fill_tcp_info_from_history(sc, te, ti);
release_tcb_histent(te);
} else {
if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) {
/* XXX: tell firmware to flush TCB cache. */
}
read_tcb_using_memwin(sc, tid, tcb);
fill_tcp_info_from_tcb(sc, tcb, ti);
}
}
/*
* Called by the kernel to allow the TOE driver to "refine" the values filled in
* the tcp_info for an offloaded connection.
*/
static void
t4_tcp_info(struct toedev *tod, const struct tcpcb *tp, struct tcp_info *ti)
{
struct adapter *sc = tod->tod_softc;
struct toepcb *toep = tp->t_toe;
INP_LOCK_ASSERT(tptoinpcb(tp));
MPASS(ti != NULL);
fill_tcp_info(sc, toep->tid, ti);
}
#ifdef KERN_TLS
static int
t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp,
struct ktls_session *tls, int direction)
{
struct toepcb *toep = tp->t_toe;
INP_WLOCK_ASSERT(tptoinpcb(tp));
MPASS(tls != NULL);
return (tls_alloc_ktls(toep, tls, direction));
}
#endif
static void
send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep)
{
struct wrq_cookie cookie;
struct fw_flowc_wr *flowc;
struct ofld_tx_sdesc *txsd;
const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval);
const int flowclen16 = howmany(flowclen, 16);
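/* The FLOWC consumes tx credits and a tx descriptor like any other tx work request. */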
if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) {
CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__,
toep->tid, toep->tx_credits, toep->txsd_avail);
return;
}
flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie);
if (__predict_false(flowc == NULL)) {
CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid);
return;
}
flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
V_FW_FLOWC_WR_NPARAMS(1));
flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
V_FW_WR_FLOWID(toep->tid));
flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS;
flowc->mnemval[0].val = htobe32(toep->params.emss);
txsd = &toep->txsd[toep->txsd_pidx];
txsd->tx_credits = flowclen16;
txsd->plen = 0;
toep->tx_credits -= txsd->tx_credits;
if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
toep->txsd_pidx = 0;
toep->txsd_avail--;
commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie);
}
static void
t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu)
{
struct work_request_hdr *wrh;
struct ulp_txpkt *ulpmc;
int idx, len;
struct wrq_cookie cookie;
struct inpcb *inp = tptoinpcb(tp);
struct toepcb *toep = tp->t_toe;
struct adapter *sc = td_adapter(toep->td);
unsigned short *mtus = &sc->params.mtus[0];
INP_WLOCK_ASSERT(inp);
MPASS(mtu > 0); /* kernel is supposed to provide something usable. */
/* tp->snd_una and snd_max are in host byte order too. */
seq = be32toh(seq);
CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)",
__func__, toep->tid, seq, mtu, toep->params.mtu_idx,
mtus[toep->params.mtu_idx]);
if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */
(SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) {
CTR5(KTR_CXGBE,
"%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).",
__func__, toep->tid, seq, tp->snd_una, tp->snd_max);
return;
}
/* Find the best mtu_idx for the suggested MTU. */
for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++)
continue;
if (idx >= toep->params.mtu_idx)
return; /* Never increase the PMTU (just like the kernel). */
/*
* We'll send a compound work request with 2 SET_TCB_FIELDs -- the first
* one updates the mtu_idx and the second one triggers a retransmit.
*/
len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie);
if (wrh == NULL) {
CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n",
toep->tid, toep->params.mtu_idx, idx);
return;
}
INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */
ulpmc = (struct ulp_txpkt *)(wrh + 1);
ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_MAXSEG,
V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx));
ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TIMESTAMP,
V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0);
commit_wrq_wr(toep->ctrlq, wrh, &cookie);
/* Update the software toepcb and tcpcb. */
toep->params.mtu_idx = idx;
tp->t_maxseg = mtus[toep->params.mtu_idx];
if (inp->inp_inc.inc_flags & INC_ISIPV6)
tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
else
tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
toep->params.emss = tp->t_maxseg;
if (tp->t_flags & TF_RCVD_TSTMP)
toep->params.emss -= TCPOLEN_TSTAMP_APPA;
/* Update the firmware flowc. */
send_mss_flowc_wr(sc, toep);
/* Update the MTU in the kernel's hostcache. */
if (sc->tt.update_hc_on_pmtu_change != 0) {
struct in_conninfo inc = {0};
inc.inc_fibnum = inp->inp_inc.inc_fibnum;
if (inp->inp_inc.inc_flags & INC_ISIPV6) {
inc.inc_flags |= INC_ISIPV6;
inc.inc6_faddr = inp->inp_inc.inc6_faddr;
} else {
inc.inc_faddr = inp->inp_inc.inc_faddr;
}
tcp_hc_updatemtu(&inc, mtu);
}
CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
__func__, toep->tid, toep->params.mtu_idx,
mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss);
}
/*
* The TOE driver will not receive any more CPLs for the tid associated with the
* toepcb; release the hold on the inpcb.
*/
void
final_cpl_received(struct toepcb *toep)
{
struct inpcb *inp = toep->inp;
bool need_wakeup;
KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
INP_WLOCK_ASSERT(inp);
KASSERT(toep->flags & TPF_CPL_PENDING,
("%s: CPL not pending already?", __func__));
CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
__func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
if (ulp_mode(toep) == ULP_MODE_TCPDDP)
release_ddp_resources(toep);
toep->inp = NULL;
need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0;
toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL);
mbufq_drain(&toep->ulp_pduq);
mbufq_drain(&toep->ulp_pdu_reclaimq);
release_offload_resources(toep);
if (!(toep->flags & TPF_ATTACHED))
done_with_toepcb(toep);
if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
if (need_wakeup) {
struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
mtx_lock(lock);
wakeup(toep);
mtx_unlock(lock);
}
}
void
insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
{
struct tid_info *t = &sc->tids;
MPASS(tid >= t->tid_base);
MPASS(tid - t->tid_base < t->ntids);
t->tid_tab[tid - t->tid_base] = ctx;
atomic_add_int(&t->tids_in_use, ntids);
}
void *
lookup_tid(struct adapter *sc, int tid)
{
struct tid_info *t = &sc->tids;
return (t->tid_tab[tid - t->tid_base]);
}
void
update_tid(struct adapter *sc, int tid, void *ctx)
{
struct tid_info *t = &sc->tids;
t->tid_tab[tid - t->tid_base] = ctx;
}
void
remove_tid(struct adapter *sc, int tid, int ntids)
{
struct tid_info *t = &sc->tids;
t->tid_tab[tid - t->tid_base] = NULL;
atomic_subtract_int(&t->tids_in_use, ntids);
}
/*
* What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt
* have the MSS that we should advertise in our SYN. Advertised MSS doesn't
* account for any TCP options so the effective MSS (only payload, no headers or
* options) could be different.
*/
static int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
struct offload_settings *s)
{
unsigned short *mtus = &sc->params.mtus[0];
int i, mss, mtu;
MPASS(inc != NULL);
mss = s->mss > 0 ? s->mss : tcp_mssopt(inc);
if (inc->inc_flags & INC_ISIPV6)
mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
else
mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr);
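/* The MTU table is in ascending order; pick the largest entry that fits. */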
for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
continue;
return (i);
}
/*
* Determine the receive window size for a socket.
*/
u_long
select_rcv_wnd(struct socket *so)
{
unsigned long wnd;
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
wnd = sbspace(&so->so_rcv);
if (wnd < MIN_RCV_WND)
wnd = MIN_RCV_WND;
return min(wnd, MAX_RCV_WND);
}
int
select_rcv_wscale(void)
{
int wscale = 0;
unsigned long space = sb_max;
if (space > MAX_RCV_WND)
space = MAX_RCV_WND;
while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
wscale++;
return (wscale);
}
__be64
calc_options0(struct vi_info *vi, struct conn_params *cp)
{
uint64_t opt0 = 0;
opt0 |= F_TCAM_BYPASS;
MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE);
opt0 |= V_WND_SCALE(cp->wscale);
MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS);
opt0 |= V_MSS_IDX(cp->mtu_idx);
MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE);
opt0 |= V_ULP_MODE(cp->ulp_mode);
MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ);
opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize);
MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size);
opt0 |= V_L2T_IDX(cp->l2t_idx);
opt0 |= V_SMAC_SEL(vi->smt_idx);
opt0 |= V_TX_CHAN(vi->pi->tx_chan);
MPASS(cp->keepalive == 0 || cp->keepalive == 1);
opt0 |= V_KEEP_ALIVE(cp->keepalive);
MPASS(cp->nagle == 0 || cp->nagle == 1);
opt0 |= V_NAGLE(cp->nagle);
return (htobe64(opt0));
}
__be32
calc_options2(struct vi_info *vi, struct conn_params *cp)
{
uint32_t opt2 = 0;
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
/*
* rx flow control, rx coalesce, congestion control, and tx pace are all
* explicitly set by the driver. On T5+ the ISS is also set by the
* driver to the value picked by the kernel.
*/
if (is_t4(sc)) {
opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
} else {
opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */
opt2 |= F_T5_ISS; /* ISS provided in CPL */
}
MPASS(cp->sack == 0 || cp->sack == 1);
opt2 |= V_SACK_EN(cp->sack);
MPASS(cp->tstamp == 0 || cp->tstamp == 1);
opt2 |= V_TSTAMPS_EN(cp->tstamp);
if (cp->wscale > 0)
opt2 |= F_WND_SCALE_EN;
MPASS(cp->ecn == 0 || cp->ecn == 1);
opt2 |= V_CCTRL_ECN(cp->ecn);
opt2 |= V_TX_QUEUE(TX_MODQ(pi->tx_chan));
opt2 |= V_PACE(0);
opt2 |= F_RSS_QUEUE_VALID;
opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id);
if (chip_id(sc) <= CHELSIO_T6) {
MPASS(pi->rx_chan == 0 || pi->rx_chan == 1);
opt2 |= V_RX_CHANNEL(pi->rx_chan);
}
MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL);
opt2 |= V_CONG_CNTRL(cp->cong_algo);
MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1);
if (cp->rx_coalesce == 1)
opt2 |= V_RX_COALESCE(M_RX_COALESCE);
opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
MPASS(cp->ulp_mode != ULP_MODE_TCPDDP);
return (htobe32(opt2));
}
uint64_t
select_ntuple(struct vi_info *vi, struct l2t_entry *e)
{
struct adapter *sc = vi->adapter;
struct tp_params *tp = &sc->params.tp;
uint64_t ntuple = 0;
/*
* Initialize each of the fields which we care about which are present
* in the Compressed Filter Tuple.
*/
if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE)
ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;
if (tp->port_shift >= 0)
ntuple |= (uint64_t)e->lport << tp->port_shift;
if (tp->protocol_shift >= 0)
ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;
if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) {
ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) |
V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) <<
tp->vnic_shift;
}
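/* T4 uses a 32-bit compressed filter tuple; T5 and later use the wider FILTER_TUPLE format. */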
if (is_t4(sc))
return (htobe32((uint32_t)ntuple));
else
return (htobe64(V_FILTER_TUPLE(ntuple)));
}
/*
* Initialize various connection parameters.
*/
void
init_conn_params(struct vi_info *vi , struct offload_settings *s,
struct in_conninfo *inc, struct socket *so,
const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp)
{
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
struct tom_tunables *tt = &sc->tt;
struct inpcb *inp = sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
u_long wnd;
u_int q_idx;
MPASS(s->offload != 0);
/* Congestion control algorithm */
if (s->cong_algo >= 0)
cp->cong_algo = s->cong_algo & M_CONG_CNTRL;
else if (sc->tt.cong_algorithm >= 0)
cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL;
else {
struct cc_algo *cc = CC_ALGO(tp);
if (strcasecmp(cc->name, "reno") == 0)
cp->cong_algo = CONG_ALG_RENO;
else if (strcasecmp(cc->name, "tahoe") == 0)
cp->cong_algo = CONG_ALG_TAHOE;
else if (strcasecmp(cc->name, "newreno") == 0)
cp->cong_algo = CONG_ALG_NEWRENO;
else if (strcasecmp(cc->name, "highspeed") == 0)
cp->cong_algo = CONG_ALG_HIGHSPEED;
else {
/*
* Use newreno in case the algorithm selected by the
* host stack is not supported by the hardware.
*/
cp->cong_algo = CONG_ALG_NEWRENO;
}
}
/* Tx traffic scheduling class. */
if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls)
cp->tc_idx = s->sched_class;
else
cp->tc_idx = -1;
/* Nagle's algorithm. */
if (s->nagle >= 0)
cp->nagle = s->nagle > 0 ? 1 : 0;
else
cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
/* TCP Keepalive. */
if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE)
cp->keepalive = 1;
else
cp->keepalive = 0;
/* Optimization that's specific to T5 @ 40G. */
if (tt->tx_align >= 0)
cp->tx_align = tt->tx_align > 0 ? 1 : 0;
else if (chip_id(sc) == CHELSIO_T5 &&
(port_top_speed(pi) > 10 || sc->params.nports > 2))
cp->tx_align = 1;
else
cp->tx_align = 0;
/* ULP mode. */
cp->ulp_mode = ULP_MODE_NONE;
/* Rx coalescing. */
if (s->rx_coalesce >= 0)
cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0;
else if (tt->rx_coalesce >= 0)
cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0;
else
cp->rx_coalesce = 1; /* default */
/*
* Index in the PMTU table. This controls the MSS that we announce in
* our SYN initially, but after ESTABLISHED it controls the MSS that we
* use to send data.
*/
cp->mtu_idx = find_best_mtu_idx(sc, inc, s);
/* Tx queue for this connection. */
if (s->txq == QUEUE_RANDOM)
q_idx = arc4random();
else if (s->txq == QUEUE_ROUNDROBIN)
q_idx = atomic_fetchadd_int(&vi->txq_rr, 1);
else
q_idx = s->txq;
cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq;
/* Rx queue for this connection. */
if (s->rxq == QUEUE_RANDOM)
q_idx = arc4random();
else if (s->rxq == QUEUE_ROUNDROBIN)
q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1);
else
q_idx = s->rxq;
cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq;
if (SOLISTENING(so)) {
/* Passive open */
MPASS(tcpopt != NULL);
/* TCP timestamp option */
if (tcpopt->tstamp &&
(s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
cp->tstamp = 1;
else
cp->tstamp = 0;
/* SACK */
if (tcpopt->sack &&
(s->sack > 0 || (s->sack < 0 && V_tcp_do_sack)))
cp->sack = 1;
else
cp->sack = 0;
/* Receive window scaling. */
if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323)
cp->wscale = select_rcv_wscale();
else
cp->wscale = 0;
/* ECN */
if (tcpopt->ecn && /* XXX: review. */
(s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
cp->ecn = 1;
else
cp->ecn = 0;
wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
if (tt->sndbuf > 0)
cp->sndbuf = tt->sndbuf;
else if (so->sol_sbsnd_flags & SB_AUTOSIZE &&
V_tcp_do_autosndbuf)
cp->sndbuf = 256 * 1024;
else
cp->sndbuf = so->sol_sbsnd_hiwat;
} else {
/* Active open */
/* TCP timestamp option */
if (s->tstamp > 0 ||
(s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
cp->tstamp = 1;
else
cp->tstamp = 0;
/* SACK */
if (s->sack > 0 ||
(s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
cp->sack = 1;
else
cp->sack = 0;
/* Receive window scaling */
if (tp->t_flags & TF_REQ_SCALE)
cp->wscale = select_rcv_wscale();
else
cp->wscale = 0;
/* ECN */
if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
cp->ecn = 1;
else
cp->ecn = 0;
SOCKBUF_LOCK(&so->so_rcv);
wnd = max(select_rcv_wnd(so), MIN_RCV_WND);
SOCKBUF_UNLOCK(&so->so_rcv);
cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
if (tt->sndbuf > 0)
cp->sndbuf = tt->sndbuf;
else {
SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_flags & SB_AUTOSIZE &&
V_tcp_do_autosndbuf)
cp->sndbuf = 256 * 1024;
else
cp->sndbuf = so->so_snd.sb_hiwat;
SOCKBUF_UNLOCK(&so->so_snd);
}
}
cp->l2t_idx = l2t_idx;
/* This will be initialized on ESTABLISHED. */
cp->emss = 0;
}
int
negative_advice(int status)
{
return (status == CPL_ERR_RTX_NEG_ADVICE ||
status == CPL_ERR_PERSIST_NEG_ADVICE ||
status == CPL_ERR_KEEPALV_NEG_ADVICE);
}
static int
-alloc_tid_tab(struct tid_info *t, int flags)
+alloc_tid_tab(struct adapter *sc)
{
+ struct tid_info *t = &sc->tids;
MPASS(t->ntids > 0);
MPASS(t->tid_tab == NULL);
t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE,
- M_ZERO | flags);
+ M_ZERO | M_NOWAIT);
if (t->tid_tab == NULL)
return (ENOMEM);
atomic_store_rel_int(&t->tids_in_use, 0);
return (0);
}
static void
-free_tid_tab(struct tid_info *t)
+free_tid_tab(struct adapter *sc)
{
+ struct tid_info *t = &sc->tids;
KASSERT(t->tids_in_use == 0,
("%s: %d tids still in use.", __func__, t->tids_in_use));
free(t->tid_tab, M_CXGBE);
t->tid_tab = NULL;
}
-static int
-alloc_stid_tab(struct tid_info *t, int flags)
-{
-
- MPASS(t->nstids > 0);
- MPASS(t->stid_tab == NULL);
-
- t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
- M_ZERO | flags);
- if (t->stid_tab == NULL)
- return (ENOMEM);
- mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
- t->stids_in_use = 0;
- TAILQ_INIT(&t->stids);
- t->nstids_free_head = t->nstids;
-
- return (0);
-}
-
static void
-free_stid_tab(struct tid_info *t)
+free_tid_tabs(struct adapter *sc)
{
-
- KASSERT(t->stids_in_use == 0,
- ("%s: %d tids still in use.", __func__, t->stids_in_use));
-
- if (mtx_initialized(&t->stid_lock))
- mtx_destroy(&t->stid_lock);
- free(t->stid_tab, M_CXGBE);
- t->stid_tab = NULL;
-}
-
-static void
-free_tid_tabs(struct tid_info *t)
-{
-
- free_tid_tab(t);
- free_stid_tab(t);
+ free_tid_tab(sc);
+ free_stid_tab(sc);
}
static int
-alloc_tid_tabs(struct tid_info *t)
+alloc_tid_tabs(struct adapter *sc)
{
int rc;
- rc = alloc_tid_tab(t, M_NOWAIT);
+ rc = alloc_tid_tab(sc);
if (rc != 0)
goto failed;
- rc = alloc_stid_tab(t, M_NOWAIT);
+ rc = alloc_stid_tab(sc);
if (rc != 0)
goto failed;
return (0);
failed:
- free_tid_tabs(t);
+ free_tid_tabs(sc);
return (rc);
}
static inline void
alloc_tcb_history(struct adapter *sc, struct tom_data *td)
{
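/* TCB history is only kept on adapters with a small tid range (at most 1024 tids). */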
if (sc->tids.ntids == 0 || sc->tids.ntids > 1024)
return;
rw_init(&td->tcb_history_lock, "TCB history");
td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history),
M_CXGBE, M_ZERO | M_NOWAIT);
td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0));
}
static inline void
free_tcb_history(struct adapter *sc, struct tom_data *td)
{
#ifdef INVARIANTS
int i;
if (td->tcb_history != NULL) {
for (i = 0; i < sc->tids.ntids; i++) {
MPASS(td->tcb_history[i] == NULL);
}
}
#endif
free(td->tcb_history, M_CXGBE);
if (rw_initialized(&td->tcb_history_lock))
rw_destroy(&td->tcb_history_lock);
}
static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{
ASSERT_SYNCHRONIZED_OP(sc);
KASSERT(TAILQ_EMPTY(&td->toep_list),
("%s: TOE PCB list is not empty.", __func__));
KASSERT(td->lctx_count == 0,
("%s: lctx hash table is not empty.", __func__));
t4_free_ppod_region(&td->pr);
if (td->listen_mask != 0)
hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
if (mtx_initialized(&td->unsent_wr_lock))
mtx_destroy(&td->unsent_wr_lock);
if (mtx_initialized(&td->lctx_hash_lock))
mtx_destroy(&td->lctx_hash_lock);
if (mtx_initialized(&td->toep_list_lock))
mtx_destroy(&td->toep_list_lock);
free_tcb_history(sc, td);
- free_tid_tabs(&sc->tids);
+ free_tid_tabs(sc);
free(td, M_CXGBE);
}
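/*
 * Build a header-only template packet (Ethernet/IP/TCP, no payload) for an
 * active or listening open so that lookup_offload_policy can run the COP's
 * BPF rules against it.
 */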
static char *
prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
int *buflen)
{
char *pkt;
struct tcphdr *th;
int ipv6, len;
const int maxlen =
max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
sizeof(struct tcphdr);
MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);
pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
if (pkt == NULL)
return (NULL);
ipv6 = inp->inp_vflag & INP_IPV6;
len = 0;
if (EVL_VLANOFTAG(vtag) == 0xfff) {
struct ether_header *eh = (void *)pkt;
if (ipv6)
eh->ether_type = htons(ETHERTYPE_IPV6);
else
eh->ether_type = htons(ETHERTYPE_IP);
len += sizeof(*eh);
} else {
struct ether_vlan_header *evh = (void *)pkt;
evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
evh->evl_tag = htons(vtag);
if (ipv6)
evh->evl_proto = htons(ETHERTYPE_IPV6);
else
evh->evl_proto = htons(ETHERTYPE_IP);
len += sizeof(*evh);
}
if (ipv6) {
struct ip6_hdr *ip6 = (void *)&pkt[len];
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_nxt = IPPROTO_TCP;
if (open_type == OPEN_TYPE_ACTIVE) {
ip6->ip6_src = inp->in6p_laddr;
ip6->ip6_dst = inp->in6p_faddr;
} else if (open_type == OPEN_TYPE_LISTEN) {
ip6->ip6_src = inp->in6p_laddr;
ip6->ip6_dst = ip6->ip6_src;
}
len += sizeof(*ip6);
} else {
struct ip *ip = (void *)&pkt[len];
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
ip->ip_tos = inp->inp_ip_tos;
ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
ip->ip_ttl = inp->inp_ip_ttl;
ip->ip_p = IPPROTO_TCP;
if (open_type == OPEN_TYPE_ACTIVE) {
ip->ip_src = inp->inp_laddr;
ip->ip_dst = inp->inp_faddr;
} else if (open_type == OPEN_TYPE_LISTEN) {
ip->ip_src = inp->inp_laddr;
ip->ip_dst = ip->ip_src;
}
len += sizeof(*ip);
}
th = (void *)&pkt[len];
if (open_type == OPEN_TYPE_ACTIVE) {
th->th_sport = inp->inp_lport; /* network byte order already */
th->th_dport = inp->inp_fport; /* ditto */
} else if (open_type == OPEN_TYPE_LISTEN) {
th->th_sport = inp->inp_lport; /* network byte order already */
th->th_dport = th->th_sport;
}
len += sizeof(th);
*pktlen = *buflen = len;
return (pkt);
}
const struct offload_settings *
lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
uint16_t vtag, struct inpcb *inp)
{
const struct t4_offload_policy *op;
char *pkt;
struct offload_rule *r;
int i, matched, pktlen, buflen;
static const struct offload_settings allow_offloading_settings = {
.offload = 1,
.rx_coalesce = -1,
.cong_algo = -1,
.sched_class = -1,
.tstamp = -1,
.sack = -1,
.nagle = -1,
.ecn = -1,
.ddp = -1,
.tls = -1,
.txq = QUEUE_RANDOM,
.rxq = QUEUE_RANDOM,
.mss = -1,
};
static const struct offload_settings disallow_offloading_settings = {
.offload = 0,
/* rest is irrelevant when offload is off. */
};
rw_assert(&sc->policy_lock, RA_LOCKED);
/*
* If there's no Connection Offloading Policy attached to the device
* then we need to return a default static policy. If
* "cop_managed_offloading" is true, then we need to disallow
* offloading until a COP is attached to the device. Otherwise we
* allow offloading by default.
*/
op = sc->policy;
if (op == NULL) {
if (sc->tt.cop_managed_offloading)
return (&disallow_offloading_settings);
else
return (&allow_offloading_settings);
}
switch (open_type) {
case OPEN_TYPE_ACTIVE:
case OPEN_TYPE_LISTEN:
pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
break;
case OPEN_TYPE_PASSIVE:
MPASS(m != NULL);
pkt = mtod(m, char *);
MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
pkt += sizeof(struct cpl_pass_accept_req);
pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
break;
default:
MPASS(0);
return (&disallow_offloading_settings);
}
if (pkt == NULL || pktlen == 0 || buflen == 0)
return (&disallow_offloading_settings);
matched = 0;
r = &op->rule[0];
for (i = 0; i < op->nrules; i++, r++) {
if (r->open_type != open_type &&
r->open_type != OPEN_TYPE_DONTCARE) {
continue;
}
matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
if (matched)
break;
}
if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
free(pkt, M_CXGBE);
return (matched ? &r->settings : &disallow_offloading_settings);
}
static void
reclaim_wr_resources(void *arg, int count)
{
struct tom_data *td = arg;
STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
struct cpl_act_open_req *cpl;
u_int opcode, atid, tid;
struct wrqe *wr;
struct adapter *sc = td_adapter(td);
mtx_lock(&td->unsent_wr_lock);
STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
mtx_unlock(&td->unsent_wr_lock);
while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
STAILQ_REMOVE_HEAD(&twr_list, link);
cpl = wrtod(wr);
opcode = GET_OPCODE(cpl);
switch (opcode) {
case CPL_ACT_OPEN_REQ:
case CPL_ACT_OPEN_REQ6:
atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
- act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
+ act_open_failure_cleanup(sc, lookup_atid(sc, atid),
+ EHOSTUNREACH);
free(wr, M_CXGBE);
break;
case CPL_PASS_ACCEPT_RPL:
tid = GET_TID(cpl);
CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid);
- synack_failure_cleanup(sc, tid);
+ synack_failure_cleanup(sc, lookup_tid(sc, tid));
free(wr, M_CXGBE);
break;
default:
log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
"opcode %x\n", __func__, wr, wr->wr_len, opcode);
/* WR not freed here; go look at it with a debugger. */
}
}
}
+/*
+ * Based on do_abort_req. We treat an abrupt hardware stop as a connection
+ * abort from the hardware.
+ */
+static void
+live_tid_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct epoch_tracker et;
+
+ MPASS(!(toep->flags & TPF_SYNQE));
+
+ inp = toep->inp;
+ CURVNET_SET(toep->vnet);
+ NET_EPOCH_ENTER(et); /* for tcp_close */
+ INP_WLOCK(inp);
+ tp = intotcpcb(inp);
+ toep->flags |= TPF_ABORT_SHUTDOWN;
+ if ((inp->inp_flags & INP_DROPPED) == 0) {
+ struct socket *so = inp->inp_socket;
+
+ if (so != NULL)
+ so_error_set(so, status);
+ tp = tcp_close(tp);
+ if (tp == NULL)
+ INP_WLOCK(inp); /* re-acquire */
+ }
+ final_cpl_received(toep);
+ NET_EPOCH_EXIT(et);
+ CURVNET_RESTORE();
+}
+
+static void
+cleanup_stranded_tids(void *arg, int count)
+{
+ TAILQ_HEAD(, toepcb) tlist = TAILQ_HEAD_INITIALIZER(tlist);
+ TAILQ_HEAD(, synq_entry) slist = TAILQ_HEAD_INITIALIZER(slist);
+ struct tom_data *td = arg;
+ struct adapter *sc = td_adapter(td);
+ struct toepcb *toep;
+ struct synq_entry *synqe;
+
+ /* Clean up synq entries. */
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_SWAP(&td->stranded_synqe, &slist, synq_entry, link);
+ mtx_unlock(&td->toep_list_lock);
+ while ((synqe = TAILQ_FIRST(&slist)) != NULL) {
+ TAILQ_REMOVE(&slist, synqe, link);
+ MPASS(synqe->tid >= 0); /* stale, was kept around for debug */
+ synqe->tid = -1;
+ synack_failure_cleanup(sc, synqe);
+ }
+
+ /* Clean up in-flight active opens. */
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_SWAP(&td->stranded_atids, &tlist, toepcb, link);
+ mtx_unlock(&td->toep_list_lock);
+ while ((toep = TAILQ_FIRST(&tlist)) != NULL) {
+ TAILQ_REMOVE(&tlist, toep, link);
+ MPASS(toep->tid >= 0); /* stale, was kept around for debug */
+ toep->tid = -1;
+ act_open_failure_cleanup(sc, toep, EHOSTUNREACH);
+ }
+
+ /* Clean up live connections. */
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_SWAP(&td->stranded_tids, &tlist, toepcb, link);
+ mtx_unlock(&td->toep_list_lock);
+ while ((toep = TAILQ_FIRST(&tlist)) != NULL) {
+ TAILQ_REMOVE(&tlist, toep, link);
+ MPASS(toep->tid >= 0); /* stale, was kept around for debug */
+ toep->tid = -1;
+ live_tid_failure_cleanup(sc, toep, ECONNABORTED);
+ }
+}
+
/*
* Ground control to Major TOM
* Commencing countdown, engines on
*/
static int
t4_tom_activate(struct adapter *sc)
{
struct tom_data *td;
struct toedev *tod;
struct vi_info *vi;
int i, rc, v;
ASSERT_SYNCHRONIZED_OP(sc);
/* per-adapter softc for TOM */
td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
if (td == NULL)
return (ENOMEM);
/* List of TOE PCBs and associated lock */
mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
TAILQ_INIT(&td->toep_list);
TAILQ_INIT(&td->synqe_list);
+ TAILQ_INIT(&td->stranded_atids);
+ TAILQ_INIT(&td->stranded_tids);
+ TASK_INIT(&td->cleanup_stranded_tids, 0, cleanup_stranded_tids, td);
/* Listen context */
mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
&td->listen_mask, HASH_NOWAIT);
/* List of WRs for which L2 resolution failed */
mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
STAILQ_INIT(&td->unsent_wr_list);
TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
/* TID tables */
- rc = alloc_tid_tabs(&sc->tids);
+ rc = alloc_tid_tabs(sc);
if (rc != 0)
goto done;
rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
if (rc != 0)
goto done;
t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
alloc_tcb_history(sc, td);
/* toedev ops */
tod = &td->tod;
init_toedev(tod);
tod->tod_softc = sc;
tod->tod_connect = t4_connect;
tod->tod_listen_start = t4_listen_start;
tod->tod_listen_stop = t4_listen_stop;
tod->tod_rcvd = t4_rcvd;
tod->tod_output = t4_tod_output;
tod->tod_send_rst = t4_send_rst;
tod->tod_send_fin = t4_send_fin;
tod->tod_pcb_detach = t4_pcb_detach;
tod->tod_l2_update = t4_l2_update;
tod->tod_syncache_added = t4_syncache_added;
tod->tod_syncache_removed = t4_syncache_removed;
tod->tod_syncache_respond = t4_syncache_respond;
tod->tod_offload_socket = t4_offload_socket;
tod->tod_ctloutput = t4_ctloutput;
tod->tod_tcp_info = t4_tcp_info;
#ifdef KERN_TLS
tod->tod_alloc_tls_session = t4_alloc_tls_session;
#endif
tod->tod_pmtu_update = t4_pmtu_update;
for_each_port(sc, i) {
for_each_vi(sc->port[i], v, vi) {
SETTOEDEV(vi->ifp, &td->tod);
}
}
sc->tom_softc = td;
register_toedev(sc->tom_softc);
done:
if (rc != 0)
free_tom_data(sc, td);
return (rc);
}
static int
t4_tom_deactivate(struct adapter *sc)
{
int rc = 0;
struct tom_data *td = sc->tom_softc;
ASSERT_SYNCHRONIZED_OP(sc);
if (td == NULL)
return (0); /* XXX. KASSERT? */
if (sc->offload_map != 0)
return (EBUSY); /* at least one port has IFCAP_TOE enabled */
if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */
mtx_lock(&td->toep_list_lock);
if (!TAILQ_EMPTY(&td->toep_list))
rc = EBUSY;
MPASS(TAILQ_EMPTY(&td->synqe_list));
+ MPASS(TAILQ_EMPTY(&td->stranded_tids));
mtx_unlock(&td->toep_list_lock);
mtx_lock(&td->lctx_hash_lock);
if (td->lctx_count > 0)
rc = EBUSY;
mtx_unlock(&td->lctx_hash_lock);
taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
+ taskqueue_drain(taskqueue_thread, &td->cleanup_stranded_tids);
mtx_lock(&td->unsent_wr_lock);
if (!STAILQ_EMPTY(&td->unsent_wr_list))
rc = EBUSY;
mtx_unlock(&td->unsent_wr_lock);
if (rc == 0) {
unregister_toedev(sc->tom_softc);
free_tom_data(sc, td);
sc->tom_softc = NULL;
}
return (rc);
}
+static void
+stop_atids(struct adapter *sc)
+{
+ struct tom_data *td = sc->tom_softc;
+ struct tid_info *t = &sc->tids;
+ struct toepcb *toep;
+ int atid;
+
+ /*
+ * Hashfilters and T6-KTLS are the only other users of atids but they're
+ * both mutually exclusive with TOE. That means t4_tom owns all the
+ * atids in the table.
+ */
+ MPASS(!is_hashfilter(sc));
+ if (is_t6(sc))
+ MPASS(!(sc->flags & KERN_TLS_ON));
+
+ /* New atids are not being allocated. */
+#ifdef INVARIANTS
+ mtx_lock(&t->atid_lock);
+ MPASS(t->atid_alloc_stopped == true);
+ mtx_unlock(&t->atid_lock);
+#endif
+
+ /*
+ * In-use atids fall in one of these two categories:
+ * a) Those waiting for L2 resolution before being submitted to
+ * hardware.
+ * b) Those that have been submitted to hardware and are awaiting
+ * replies that will never arrive because the LLD is stopped.
+ */
+ for (atid = 0; atid < t->natids; atid++) {
+ toep = lookup_atid(sc, atid);
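+ /*
+ * A free atid slot holds free-list linkage (a pointer to another
+ * slot in atid_tab) rather than a toepcb; skip those.
+ */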
+ if ((uintptr_t)toep >= (uintptr_t)&t->atid_tab[0] &&
+ (uintptr_t)toep < (uintptr_t)&t->atid_tab[t->natids])
+ continue;
+ MPASS(toep->tid == atid);
+ MPASS(toep->incarnation == sc->incarnation);
+ /*
+ * Take the atid out of the lookup table. toep->tid is stale
+ * after this but useful for debug.
+ */
+ CTR(KTR_CXGBE, "%s: atid %d@%d STRANDED, removed from table",
+ __func__, atid, toep->incarnation);
+ free_atid(sc, toep->tid);
+#if 0
+ toep->tid = -1;
+#endif
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_INSERT_TAIL(&td->stranded_atids, toep, link);
+ mtx_unlock(&td->toep_list_lock);
+ }
+ MPASS(atomic_load_int(&t->atids_in_use) == 0);
+}
+
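stop_atids tells free and in-use atid slots apart without any per-entry flag: a free slot stores free-list linkage, i.e. a pointer into the atid table itself, so a value that falls inside the table's own address range cannot be a toepcb. A small self-contained sketch of that technique, with illustrative names (the sketch also treats a NULL link, the tail of the free chain, as free); it is not the driver's code:

#include <stdio.h>
#include <stdint.h>

#define NSLOTS 8

/* Each slot holds either user data (in use) or free-list linkage (free). */
union slot {
    void *data;
    union slot *next;
};

static union slot table[NSLOTS];
static union slot *freelist;

static void
table_init(void)
{
    for (int i = 0; i < NSLOTS - 1; i++)
        table[i].next = &table[i + 1];
    table[NSLOTS - 1].next = NULL;
    freelist = &table[0];
}

static int
slot_alloc(void *data)
{
    union slot *s = freelist;

    if (s == NULL)
        return (-1);
    freelist = s->next;
    s->data = data;
    return ((int)(s - table));
}

static int
slot_is_free(int i)
{
    uintptr_t p = (uintptr_t)table[i].data;

    /* Free slots point back into the table (or hold a NULL link). */
    return (p == 0 ||
        (p >= (uintptr_t)&table[0] && p < (uintptr_t)&table[NSLOTS]));
}

int
main(void)
{
    int used;

    table_init();
    used = slot_alloc("payload");
    for (int i = 0; i < NSLOTS; i++)
        printf("slot %d: %s\n", i, slot_is_free(i) ? "free" : "in use");
    printf("allocated slot %d\n", used);
    return (0);
}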
+static void
+stop_tids(struct adapter *sc)
+{
+ struct tom_data *td = sc->tom_softc;
+ struct toepcb *toep;
+#ifdef INVARIANTS
+ struct tid_info *t = &sc->tids;
+#endif
+
+ /*
+ * The LLD's offload queues are stopped so do_act_establish and
+ * do_pass_accept_req cannot run and insert tids in parallel with this
+ * thread. stop_stid_tab has also run and removed the synq entries'
+ * tids from the table. The only tids in the table are for connections
+ * at or beyond ESTABLISHED that are still waiting for the final CPL.
+ */
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_FOREACH(toep, &td->toep_list, link) {
+ MPASS(sc->incarnation == toep->incarnation);
+ MPASS(toep->tid >= 0);
+ MPASS(toep == lookup_tid(sc, toep->tid));
+ /* Remove tid from the lookup table immediately. */
+ CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
+ __func__, toep->tid, toep->incarnation);
+ remove_tid(sc, toep->tid, toep->ce ? 2 : 1);
+#if 0
+ /* toep->tid is stale now but left alone for debug. */
+ toep->tid = -1;
+#endif
+ /* All toep in this list will get bulk moved to stranded_tids */
+ toep->flags &= ~TPF_IN_TOEP_LIST;
+ }
+ MPASS(TAILQ_EMPTY(&td->stranded_tids));
+ TAILQ_CONCAT(&td->stranded_tids, &td->toep_list, link);
+ MPASS(TAILQ_EMPTY(&td->toep_list));
+ mtx_unlock(&td->toep_list_lock);
+
+ MPASS(atomic_load_int(&t->tids_in_use) == 0);
+}
+
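stop_tids empties td->toep_list into td->stranded_tids with a single TAILQ_CONCAT, which splices the whole source list onto the tail of the destination in constant time and leaves the source empty, which is what the MPASS on toep_list checks afterwards. A short userland sketch of those semantics, assuming FreeBSD's <sys/queue.h> and using illustrative names:

#include <sys/queue.h>
#include <assert.h>
#include <stdio.h>

struct node {
    TAILQ_ENTRY(node) link;
    int id;
};
TAILQ_HEAD(node_list, node);

int
main(void)
{
    struct node_list src = TAILQ_HEAD_INITIALIZER(src);
    struct node_list dst = TAILQ_HEAD_INITIALIZER(dst);
    struct node nodes[4];
    struct node *n;

    for (int i = 0; i < 4; i++) {
        nodes[i].id = i;
        TAILQ_INSERT_TAIL(&src, &nodes[i], link);
    }

    /* O(1) bulk move: src is spliced onto dst and left empty. */
    TAILQ_CONCAT(&dst, &src, link);
    assert(TAILQ_EMPTY(&src));

    TAILQ_FOREACH(n, &dst, link)
        printf("node %d\n", n->id);
    return (0);
}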
+/*
+ * L2T is stable because
+ * 1. stop_lld stopped all new allocations.
+ * 2. stop_lld also stopped the tx wrq so nothing is enqueueing new WRs to the
+ * queue or to l2t_entry->wr_list.
+ * 3. t4_l2t_update is ignoring all L2 updates.
+ */
+static void
+stop_tom_l2t(struct adapter *sc)
+{
+ struct l2t_data *d = sc->l2t;
+ struct l2t_entry *e;
+ int i;
+
+ for (i = 0; i < d->l2t_size; i++) {
+ e = &d->l2tab[i];
+ mtx_lock(&e->lock);
+ if (e->state == L2T_STATE_VALID)
+ e->state = L2T_STATE_RESOLVING;
+ if (!STAILQ_EMPTY(&e->wr_list))
+ CXGBE_UNIMPLEMENTED("l2t e->wr_list");
+ mtx_unlock(&e->lock);
+ }
+}
+
+static int
+t4_tom_stop(struct adapter *sc)
+{
+ struct tid_info *t = &sc->tids;
+ struct tom_data *td = sc->tom_softc;
+
+ ASSERT_SYNCHRONIZED_OP(sc);
+
+ stop_tom_l2t(sc);
+ if (atomic_load_int(&t->atids_in_use) > 0)
+ stop_atids(sc);
+ if (atomic_load_int(&t->stids_in_use) > 0)
+ stop_stid_tab(sc);
+ if (atomic_load_int(&t->tids_in_use) > 0)
+ stop_tids(sc);
+ taskqueue_enqueue(taskqueue_thread, &td->cleanup_stranded_tids);
+
+ return (0);
+}
+
+static int
+t4_tom_restart(struct adapter *sc)
+{
+ ASSERT_SYNCHRONIZED_OP(sc);
+
+ restart_stid_tab(sc);
+
+ return (0);
+}
+
static int
t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
{
struct tcpcb *tp = sototcpcb(so);
struct toepcb *toep = tp->t_toe;
int error, optval;
if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) {
if (sopt->sopt_dir != SOPT_SET)
return (EOPNOTSUPP);
if (sopt->sopt_td != NULL) {
/* Only settable by the kernel. */
return (EPERM);
}
error = sooptcopyin(sopt, &optval, sizeof(optval),
sizeof(optval));
if (error != 0)
return (error);
if (optval != 0)
return (t4_enable_ddp_rcv(so, toep));
else
return (EOPNOTSUPP);
}
return (tcp_ctloutput(so, sopt));
}
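t4_ctloutput_tom honors TCP_USE_DDP only for requests that originate inside the kernel (sopt_td == NULL); a user thread gets EPERM. A hedged sketch of how an in-kernel consumer might enable DDP receive through the regular socket-option path; sosetopt() and struct sockopt are the stock FreeBSD kernel interfaces, while the function name is hypothetical and the usual kernel socket headers are assumed rather than shown:

/* Hypothetical in-kernel caller; assumes a connected, offloaded socket. */
static int
enable_toe_ddp_rcv(struct socket *so)
{
    struct sockopt sopt;
    int optval = 1;

    memset(&sopt, 0, sizeof(sopt));
    sopt.sopt_dir = SOPT_SET;
    sopt.sopt_level = IPPROTO_TCP;
    sopt.sopt_name = TCP_USE_DDP;
    sopt.sopt_val = &optval;
    sopt.sopt_valsize = sizeof(optval);
    sopt.sopt_td = NULL;    /* kernel request; passes the EPERM check above */

    return (sosetopt(so, &sopt));
}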
static int
t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
{
struct tcpcb *tp = sototcpcb(so);
struct toepcb *toep = tp->t_toe;
int error;
/*
* No lock is needed as TOE sockets never change between
* active and passive.
*/
if (SOLISTENING(so))
return (EINVAL);
if (ulp_mode(toep) == ULP_MODE_TCPDDP ||
ulp_mode(toep) == ULP_MODE_NONE) {
error = t4_aio_queue_ddp(so, job);
if (error != EOPNOTSUPP)
return (error);
}
return (t4_aio_queue_aiotx(so, job));
}
static int
t4_tom_mod_load(void)
{
/* CPL handlers */
t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2,
CPL_COOKIE_TOM);
t4_init_connect_cpl_handlers();
t4_init_listen_cpl_handlers();
t4_init_cpl_io_handlers();
t4_ddp_mod_load();
t4_tls_mod_load();
bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
toe_protosw.pr_ctloutput = t4_ctloutput_tom;
toe_protosw.pr_aio_queue = t4_aio_queue_tom;
bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
toe6_protosw.pr_aio_queue = t4_aio_queue_tom;
return (t4_register_uld(&tom_uld_info, ULD_TOM));
}
static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
return;
/* Try to free resources (works only if no port has IFCAP_TOE) */
if (uld_active(sc, ULD_TOM))
t4_deactivate_uld(sc, ULD_TOM);
end_synchronized_op(sc, 0);
}
static int
t4_tom_mod_unload(void)
{
t4_iterate(tom_uninit, NULL);
if (t4_unregister_uld(&tom_uld_info, ULD_TOM) == EBUSY)
return (EBUSY);
t4_tls_mod_unload();
t4_ddp_mod_unload();
t4_uninit_connect_cpl_handlers();
t4_uninit_listen_cpl_handlers();
t4_uninit_cpl_io_handlers();
t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM);
t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL);
return (0);
}
#endif /* TCP_OFFLOAD */
static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
int rc = 0;
#ifdef TCP_OFFLOAD
switch (cmd) {
case MOD_LOAD:
rc = t4_tom_mod_load();
break;
case MOD_UNLOAD:
rc = t4_tom_mod_unload();
break;
default:
rc = EINVAL;
}
#else
printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
rc = EOPNOTSUPP;
#endif
return (rc);
}
static moduledata_t t4_tom_moddata = {
"t4_tom",
t4_tom_modevent,
0
};
MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index 0bc368fe3d56..8debab4940b1 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -1,565 +1,578 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2012, 2015 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#ifndef __T4_TOM_H__
#define __T4_TOM_H__
#include <sys/vmem.h>
#include "common/t4_hw.h"
#include "common/t4_msg.h"
#include "tom/t4_tls.h"
#define LISTEN_HASH_SIZE 32
/*
* Min receive window. We want it to be large enough to accommodate receive
* coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
*/
#define MIN_RCV_WND (24 * 1024U)
/*
* Max receive window supported by HW in bytes. Only a small part of it can
* be set through option0, the rest needs to be set through RX_DATA_ACK.
*/
#define MAX_RCV_WND ((1U << 27) - 1)
#define DDP_RSVD_WIN (16 * 1024U)
#define SB_DDP_INDICATE SB_IN_TOE /* soreceive must respond to indicate */
#define USE_DDP_RX_FLOW_CONTROL
#define PPOD_SZ(n) ((n) * sizeof(struct pagepod))
#define PPOD_SIZE (PPOD_SZ(1))
/* TOE PCB flags */
enum {
TPF_ATTACHED = (1 << 0), /* a tcpcb refers to this toepcb */
TPF_FLOWC_WR_SENT = (1 << 1), /* firmware flow context WR sent */
TPF_TX_DATA_SENT = (1 << 2), /* some data sent */
TPF_TX_SUSPENDED = (1 << 3), /* tx suspended for lack of resources */
TPF_SEND_FIN = (1 << 4), /* send FIN after all pending data */
TPF_FIN_SENT = (1 << 5), /* FIN has been sent */
TPF_ABORT_SHUTDOWN = (1 << 6), /* connection abort is in progress */
TPF_CPL_PENDING = (1 << 7), /* haven't received the last CPL */
TPF_SYNQE = (1 << 8), /* synq_entry, not really a toepcb */
TPF_SYNQE_EXPANDED = (1 << 9), /* toepcb ready, tid context updated */
TPF_TLS_STARTING = (1 << 10), /* starting TLS receive */
TPF_KTLS = (1 << 11), /* send TLS records from KTLS */
TPF_INITIALIZED = (1 << 12), /* init_toepcb has been called */
TPF_TLS_RECEIVE = (1 << 13), /* should receive TLS records */
TPF_TLS_RX_QUIESCING = (1 << 14), /* RX quiesced for TLS RX startup */
TPF_TLS_RX_QUIESCED = (1 << 15), /* RX quiesced for TLS RX startup */
TPF_WAITING_FOR_FINAL = (1 << 16), /* waiting for wakeup on final CPL */
+ TPF_IN_TOEP_LIST = (1 << 17), /* toep is in the main td->toep_list */
};
enum {
DDP_OK = (1 << 0), /* OK to turn on DDP */
DDP_SC_REQ = (1 << 1), /* state change (on/off) requested */
DDP_ON = (1 << 2), /* DDP is turned on */
DDP_BUF0_ACTIVE = (1 << 3), /* buffer 0 in use (not invalidated) */
DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */
DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */
DDP_DEAD = (1 << 6), /* toepcb is shutting down */
DDP_AIO = (1 << 7), /* DDP used for AIO, not so_rcv */
DDP_RCVBUF = (1 << 8), /* DDP used for so_rcv, not AIO */
};
struct bio;
struct ctl_sg_entry;
struct sockopt;
struct offload_settings;
/*
* Connection parameters for an offloaded connection. These are mostly (but not
* all) hardware TOE parameters.
*/
struct conn_params {
int8_t rx_coalesce;
int8_t cong_algo;
int8_t tc_idx;
int8_t tstamp;
int8_t sack;
int8_t nagle;
int8_t keepalive;
int8_t wscale;
int8_t ecn;
int8_t mtu_idx;
int8_t ulp_mode;
int8_t tx_align;
int16_t txq_idx; /* ofld_txq = &sc->sge.ofld_txq[txq_idx] */
int16_t rxq_idx; /* ofld_rxq = &sc->sge.ofld_rxq[rxq_idx] */
int16_t l2t_idx;
uint16_t emss;
uint16_t opt0_bufsize;
u_int sndbuf; /* controls TP tx pages */
};
struct ofld_tx_sdesc {
uint32_t plen; /* payload length */
uint8_t tx_credits; /* firmware tx credits (unit is 16B) */
};
struct ppod_region {
u_int pr_start;
u_int pr_len;
u_int pr_page_shift[4];
uint32_t pr_tag_mask; /* hardware tagmask for this region. */
uint32_t pr_invalid_bit; /* OR with this to invalidate tag. */
uint32_t pr_alias_mask; /* AND with tag to get alias bits. */
u_int pr_alias_shift; /* shift this much for first alias bit. */
vmem_t *pr_arena;
};
struct ppod_reservation {
struct ppod_region *prsv_pr;
uint32_t prsv_tag; /* Full tag: pgsz, alias, tag, color */
u_int prsv_nppods;
};
struct pageset {
TAILQ_ENTRY(pageset) link;
vm_page_t *pages;
int npages;
int flags;
int offset; /* offset in first page */
int len;
struct ppod_reservation prsv;
struct vmspace *vm;
vm_offset_t start;
u_int vm_timestamp;
};
TAILQ_HEAD(pagesetq, pageset);
#define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */
struct ddp_rcv_buffer {
TAILQ_ENTRY(ddp_rcv_buffer) link;
void *buf;
struct ppod_reservation prsv;
size_t len;
u_int refs;
};
struct ddp_buffer {
union {
/* DDP_AIO fields */
struct {
struct pageset *ps;
struct kaiocb *job;
int cancel_pending;
};
/* DDP_RCVBUF fields */
struct {
struct ddp_rcv_buffer *drb;
uint32_t placed;
};
};
};
/*
* (a) - DDP_AIO only
* (r) - DDP_RCVBUF only
*/
struct ddp_pcb {
struct mtx lock;
u_int flags;
int active_id; /* the currently active DDP buffer */
struct ddp_buffer db[2];
union {
TAILQ_HEAD(, pageset) cached_pagesets; /* (a) */
TAILQ_HEAD(, ddp_rcv_buffer) cached_buffers; /* (r) */
};
TAILQ_HEAD(, kaiocb) aiojobq; /* (a) */
u_int waiting_count; /* (a) */
u_int active_count;
u_int cached_count;
struct task requeue_task;
struct kaiocb *queueing; /* (a) */
struct mtx cache_lock; /* (r) */
};
struct toepcb {
struct tom_data *td;
struct inpcb *inp; /* backpointer to host stack's PCB */
u_int flags; /* miscellaneous flags */
- TAILQ_ENTRY(toepcb) link; /* toep_list */
+ TAILQ_ENTRY(toepcb) link; /* toep_list or one of the stranded lists */
int refcount;
struct vnet *vnet;
struct vi_info *vi; /* virtual interface */
struct sge_ofld_txq *ofld_txq;
struct sge_ofld_rxq *ofld_rxq;
struct sge_wrq *ctrlq;
struct l2t_entry *l2te; /* L2 table entry used by this connection */
struct clip_entry *ce; /* CLIP table entry used by this tid */
int tid; /* Connection identifier */
+ int incarnation; /* sc->incarnation when toepcb was allocated */
/* tx credit handling */
u_int tx_total; /* total tx WR credits (in 16B units) */
u_int tx_credits; /* tx WR credits (in 16B units) available */
u_int tx_nocompl; /* tx WR credits since last compl request */
u_int plen_nocompl; /* payload since last compl request */
struct conn_params params;
void *ulpcb;
void *ulpcb2;
struct mbufq ulp_pduq; /* PDUs waiting to be sent out. */
struct mbufq ulp_pdu_reclaimq;
struct ddp_pcb ddp;
struct tls_ofld_info tls;
TAILQ_HEAD(, kaiocb) aiotx_jobq;
struct task aiotx_task;
struct socket *aiotx_so;
/* Tx software descriptor */
uint8_t txsd_total;
uint8_t txsd_pidx;
uint8_t txsd_cidx;
uint8_t txsd_avail;
struct ofld_tx_sdesc txsd[];
};
static inline int
ulp_mode(struct toepcb *toep)
{
return (toep->params.ulp_mode);
}
#define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock)
#define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock)
#define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED)
#define DDP_CACHE_LOCK(toep) mtx_lock(&(toep)->ddp.cache_lock)
#define DDP_CACHE_UNLOCK(toep) mtx_unlock(&(toep)->ddp.cache_lock)
/*
* Compressed state for embryonic connections for a listener.
*/
struct synq_entry {
struct listen_ctx *lctx; /* backpointer to listen ctx */
struct mbuf *syn;
int flags; /* same as toepcb's tp_flags */
TAILQ_ENTRY(synq_entry) link; /* synqe_list */
volatile int ok_to_respond;
volatile u_int refcnt;
int tid;
uint32_t iss;
uint32_t irs;
uint32_t ts;
uint32_t rss_hash;
__be16 tcp_opt; /* from cpl_pass_establish */
+ int incarnation;
struct toepcb *toep;
struct conn_params params;
};
/* listen_ctx flags */
#define LCTX_RPL_PENDING 1 /* waiting for a CPL_PASS_OPEN_RPL */
+#define LCTX_SETUP_IN_HW 2 /* stid entry is set up in hardware */
struct listen_ctx {
LIST_ENTRY(listen_ctx) link; /* listen hash linkage */
volatile int refcount;
int stid;
struct stid_region stid_region;
int flags;
struct inpcb *inp; /* listening socket's inp */
struct vnet *vnet;
struct sge_wrq *ctrlq;
struct sge_ofld_rxq *ofld_rxq;
struct clip_entry *ce;
};
/* tcb_histent flags */
#define TE_RPL_PENDING 1
#define TE_ACTIVE 2
/* bits in one 8b tcb_histent sample. */
#define TS_RTO (1 << 0)
#define TS_DUPACKS (1 << 1)
#define TS_FASTREXMT (1 << 2)
#define TS_SND_BACKLOGGED (1 << 3)
#define TS_CWND_LIMITED (1 << 4)
#define TS_ECN_ECE (1 << 5)
#define TS_ECN_CWR (1 << 6)
#define TS_RESERVED (1 << 7) /* Unused. */
struct tcb_histent {
struct mtx te_lock;
struct callout te_callout;
uint64_t te_tcb[TCB_SIZE / sizeof(uint64_t)];
struct adapter *te_adapter;
u_int te_flags;
u_int te_tid;
uint8_t te_pidx;
uint8_t te_sample[100];
};
struct tom_data {
struct toedev tod;
/* toepcb's associated with this TOE device */
struct mtx toep_list_lock;
TAILQ_HEAD(, toepcb) toep_list;
TAILQ_HEAD(, synq_entry) synqe_list;
+ /* List of tids left stranded because hw stopped abruptly. */
+ TAILQ_HEAD(, toepcb) stranded_atids;
+ TAILQ_HEAD(, toepcb) stranded_tids;
+ TAILQ_HEAD(, synq_entry) stranded_synqe;
+ struct task cleanup_stranded_tids;
struct mtx lctx_hash_lock;
LIST_HEAD(, listen_ctx) *listen_hash;
u_long listen_mask;
int lctx_count; /* # of lctx in the hash table */
struct ppod_region pr;
struct rwlock tcb_history_lock __aligned(CACHE_LINE_SIZE);
struct tcb_histent **tcb_history;
int dupack_threshold;
/* WRs that will not be sent to the chip because L2 resolution failed */
struct mtx unsent_wr_lock;
STAILQ_HEAD(, wrqe) unsent_wr_list;
struct task reclaim_wr_resources;
};
static inline struct tom_data *
tod_td(struct toedev *tod)
{
return (__containerof(tod, struct tom_data, tod));
}
static inline struct adapter *
td_adapter(struct tom_data *td)
{
return (td->tod.tod_softc);
}
static inline void
set_mbuf_raw_wr(struct mbuf *m, bool raw)
{
M_ASSERTPKTHDR(m);
m->m_pkthdr.PH_per.eight[6] = raw;
}
static inline bool
mbuf_raw_wr(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
return (m->m_pkthdr.PH_per.eight[6]);
}
static inline void
set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode)
{
M_ASSERTPKTHDR(m);
m->m_pkthdr.PH_per.eight[0] = ulp_submode;
}
static inline uint8_t
mbuf_ulp_submode(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
return (m->m_pkthdr.PH_per.eight[0]);
}
static inline void
set_mbuf_iscsi_iso(struct mbuf *m, bool iso)
{
M_ASSERTPKTHDR(m);
m->m_pkthdr.PH_per.eight[1] = iso;
}
static inline bool
mbuf_iscsi_iso(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
return (m->m_pkthdr.PH_per.eight[1]);
}
/* Flags for iSCSI segmentation offload. */
#define CXGBE_ISO_TYPE(flags) ((flags) & 0x3)
#define CXGBE_ISO_F 0x4
static inline void
set_mbuf_iscsi_iso_flags(struct mbuf *m, uint8_t flags)
{
M_ASSERTPKTHDR(m);
m->m_pkthdr.PH_per.eight[2] = flags;
}
static inline uint8_t
mbuf_iscsi_iso_flags(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
return (m->m_pkthdr.PH_per.eight[2]);
}
static inline void
set_mbuf_iscsi_iso_mss(struct mbuf *m, uint16_t mss)
{
M_ASSERTPKTHDR(m);
m->m_pkthdr.PH_per.sixteen[2] = mss;
}
static inline uint16_t
mbuf_iscsi_iso_mss(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
return (m->m_pkthdr.PH_per.sixteen[2]);
}
/* t4_tom.c */
struct toepcb *alloc_toepcb(struct vi_info *, int);
int init_toepcb(struct vi_info *, struct toepcb *);
struct toepcb *hold_toepcb(struct toepcb *);
void free_toepcb(struct toepcb *);
void offload_socket(struct socket *, struct toepcb *);
void restore_so_proto(struct socket *, bool);
void undo_offload_socket(struct socket *);
void final_cpl_received(struct toepcb *);
void insert_tid(struct adapter *, int, void *, int);
void *lookup_tid(struct adapter *, int);
void update_tid(struct adapter *, int, void *);
void remove_tid(struct adapter *, int, int);
u_long select_rcv_wnd(struct socket *);
int select_rcv_wscale(void);
void init_conn_params(struct vi_info *, struct offload_settings *,
struct in_conninfo *, struct socket *, const struct tcp_options *, int16_t,
struct conn_params *cp);
__be64 calc_options0(struct vi_info *, struct conn_params *);
__be32 calc_options2(struct vi_info *, struct conn_params *);
uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
int negative_advice(int);
int add_tid_to_history(struct adapter *, u_int);
void t4_pcb_detach(struct toedev *, struct tcpcb *);
/* t4_connect.c */
void t4_init_connect_cpl_handlers(void);
void t4_uninit_connect_cpl_handlers(void);
int t4_connect(struct toedev *, struct socket *, struct nhop_object *,
struct sockaddr *);
-void act_open_failure_cleanup(struct adapter *, u_int, u_int);
+void act_open_failure_cleanup(struct adapter *, struct toepcb *, u_int);
/* t4_listen.c */
void t4_init_listen_cpl_handlers(void);
void t4_uninit_listen_cpl_handlers(void);
int t4_listen_start(struct toedev *, struct tcpcb *);
int t4_listen_stop(struct toedev *, struct tcpcb *);
void t4_syncache_added(struct toedev *, void *);
void t4_syncache_removed(struct toedev *, void *);
int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
struct mbuf *);
int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
struct mbuf *);
void t4_offload_socket(struct toedev *, void *, struct socket *);
-void synack_failure_cleanup(struct adapter *, int);
+void synack_failure_cleanup(struct adapter *, struct synq_entry *);
+int alloc_stid_tab(struct adapter *);
+void free_stid_tab(struct adapter *);
+void stop_stid_tab(struct adapter *);
+void restart_stid_tab(struct adapter *);
/* t4_cpl_io.c */
void aiotx_init_toep(struct toepcb *);
int t4_aio_queue_aiotx(struct socket *, struct kaiocb *);
void t4_init_cpl_io_handlers(void);
void t4_uninit_cpl_io_handlers(void);
void send_abort_rpl(struct adapter *, struct sge_ofld_txq *, int , int);
void send_flowc_wr(struct toepcb *, struct tcpcb *);
void send_reset(struct adapter *, struct toepcb *, uint32_t);
int send_rx_credits(struct adapter *, struct toepcb *, int);
void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
int t4_close_conn(struct adapter *, struct toepcb *);
void t4_rcvd(struct toedev *, struct tcpcb *);
void t4_rcvd_locked(struct toedev *, struct tcpcb *);
int t4_tod_output(struct toedev *, struct tcpcb *);
int t4_send_fin(struct toedev *, struct tcpcb *);
int t4_send_rst(struct toedev *, struct tcpcb *);
void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *,
uint16_t, uint64_t, uint64_t, int, int);
void t4_push_frames(struct adapter *, struct toepcb *, int);
void t4_push_pdus(struct adapter *, struct toepcb *, int);
/* t4_ddp.c */
int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int,
const char *);
void t4_free_ppod_region(struct ppod_region *);
int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *);
int t4_alloc_page_pods_for_bio(struct ppod_region *, struct bio *,
struct ppod_reservation *);
int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int,
struct ppod_reservation *);
int t4_alloc_page_pods_for_sgl(struct ppod_region *, struct ctl_sg_entry *, int,
struct ppod_reservation *);
int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
struct pageset *);
int t4_write_page_pods_for_bio(struct adapter *, struct toepcb *,
struct ppod_reservation *, struct bio *, struct mbufq *);
int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *,
struct ppod_reservation *, vm_offset_t, int, struct mbufq *);
int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *,
struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *);
void t4_free_page_pods(struct ppod_reservation *);
int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
int t4_enable_ddp_rcv(struct socket *, struct toepcb *);
void t4_ddp_mod_load(void);
void t4_ddp_mod_unload(void);
void ddp_assert_empty(struct toepcb *);
void ddp_uninit_toep(struct toepcb *);
void ddp_queue_toep(struct toepcb *);
void release_ddp_resources(struct toepcb *toep);
void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t);
void handle_ddp_indicate(struct toepcb *);
void insert_ddp_data(struct toepcb *, uint32_t);
const struct offload_settings *lookup_offload_policy(struct adapter *, int,
struct mbuf *, uint16_t, struct inpcb *);
/* t4_tls.c */
bool can_tls_offload(struct adapter *);
void do_rx_data_tls(const struct cpl_rx_data *, struct toepcb *, struct mbuf *);
void t4_push_ktls(struct adapter *, struct toepcb *, int);
void tls_received_starting_data(struct adapter *, struct toepcb *,
struct sockbuf *, int);
void t4_tls_mod_load(void);
void t4_tls_mod_unload(void);
void tls_init_toep(struct toepcb *);
int tls_tx_key(struct toepcb *);
void tls_uninit_toep(struct toepcb *);
int tls_alloc_ktls(struct toepcb *, struct ktls_session *, int);
#endif