diff --git a/sys/dev/cxgbe/t4_filter.c b/sys/dev/cxgbe/t4_filter.c index 92f882dd38cf..4b583b67ba07 100644 --- a/sys/dev/cxgbe/t4_filter.c +++ b/sys/dev/cxgbe/t4_filter.c @@ -1,2236 +1,2244 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2018 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_l2t.h" #include "t4_smt.h" struct filter_entry { LIST_ENTRY(filter_entry) link_4t; LIST_ENTRY(filter_entry) link_tid; uint32_t valid:1; /* filter allocated and valid */ uint32_t locked:1; /* filter is administratively locked or busy */ uint32_t pending:1; /* filter action is pending firmware reply */ int tid; /* tid of the filter TCB */ struct l2t_entry *l2te; /* L2 table entry for DMAC rewrite */ struct smt_entry *smt; /* SMT entry for SMAC rewrite */ struct t4_filter_specification fs; }; static void free_filter_resources(struct filter_entry *); static int get_tcamfilter(struct adapter *, struct t4_filter *); static int get_hashfilter(struct adapter *, struct t4_filter *); static int set_hashfilter(struct adapter *, struct t4_filter *, uint64_t, struct l2t_entry *, struct smt_entry *); static int del_hashfilter(struct adapter *, struct t4_filter *); static int configure_hashfilter_tcb(struct adapter *, struct filter_entry *); static inline bool separate_hpfilter_region(struct adapter *sc) { return (chip_id(sc) >= CHELSIO_T6); } static inline uint32_t hf_hashfn_4t(struct t4_filter_specification *fs) { struct t4_filter_tuple *ft = &fs->val; uint32_t hash; if (fs->type) { /* IPv6 */ hash = fnv_32_buf(&ft->sip[0], 16, FNV1_32_INIT); hash = fnv_32_buf(&ft->dip[0], 16, hash); } else { hash = fnv_32_buf(&ft->sip[0], 4, FNV1_32_INIT); hash = fnv_32_buf(&ft->dip[0], 4, hash); } hash = fnv_32_buf(&ft->sport, sizeof(ft->sport), hash); hash = fnv_32_buf(&ft->dport, sizeof(ft->dport), hash); return (hash); } static inline uint32_t hf_hashfn_tid(int tid) { return (fnv_32_buf(&tid, sizeof(tid), FNV1_32_INIT)); } static int 
alloc_hftid_hash(struct tid_info *t, int flags) { int n; MPASS(t->ntids > 0); MPASS(t->hftid_hash_4t == NULL); MPASS(t->hftid_hash_tid == NULL); n = max(t->ntids / 1024, 16); t->hftid_hash_4t = hashinit_flags(n, M_CXGBE, &t->hftid_4t_mask, flags); if (t->hftid_hash_4t == NULL) return (ENOMEM); t->hftid_hash_tid = hashinit_flags(n, M_CXGBE, &t->hftid_tid_mask, flags); if (t->hftid_hash_tid == NULL) { hashdestroy(t->hftid_hash_4t, M_CXGBE, t->hftid_4t_mask); t->hftid_hash_4t = NULL; return (ENOMEM); } mtx_init(&t->hftid_lock, "T4 hashfilters", 0, MTX_DEF); cv_init(&t->hftid_cv, "t4hfcv"); return (0); } void free_hftid_hash(struct tid_info *t) { struct filter_entry *f, *ftmp; LIST_HEAD(, filter_entry) *head; int i; #ifdef INVARIANTS int n = 0; #endif if (t->tids_in_use > 0) { /* Remove everything from the tid hash. */ head = t->hftid_hash_tid; for (i = 0; i <= t->hftid_tid_mask; i++) { LIST_FOREACH_SAFE(f, &head[i], link_tid, ftmp) { LIST_REMOVE(f, link_tid); } } /* Remove and then free each filter in the 4t hash. */ head = t->hftid_hash_4t; for (i = 0; i <= t->hftid_4t_mask; i++) { LIST_FOREACH_SAFE(f, &head[i], link_4t, ftmp) { #ifdef INVARIANTS n += f->fs.type ? 2 : 1; #endif LIST_REMOVE(f, link_4t); free(f, M_CXGBE); } } MPASS(t->tids_in_use == n); t->tids_in_use = 0; } if (t->hftid_hash_4t) { hashdestroy(t->hftid_hash_4t, M_CXGBE, t->hftid_4t_mask); t->hftid_hash_4t = NULL; } if (t->hftid_hash_tid) { hashdestroy(t->hftid_hash_tid, M_CXGBE, t->hftid_tid_mask); t->hftid_hash_tid = NULL; } if (mtx_initialized(&t->hftid_lock)) { mtx_destroy(&t->hftid_lock); cv_destroy(&t->hftid_cv); } } static void insert_hf(struct adapter *sc, struct filter_entry *f, uint32_t hash) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_4t; MPASS(head != NULL); if (hash == 0) hash = hf_hashfn_4t(&f->fs); LIST_INSERT_HEAD(&head[hash & t->hftid_4t_mask], f, link_4t); atomic_add_int(&t->tids_in_use, f->fs.type ? 2 : 1); } static void insert_hftid(struct adapter *sc, struct filter_entry *f) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_tid; uint32_t hash; MPASS(f->tid >= t->tid_base); MPASS(f->tid - t->tid_base < t->ntids); mtx_assert(&t->hftid_lock, MA_OWNED); hash = hf_hashfn_tid(f->tid); LIST_INSERT_HEAD(&head[hash & t->hftid_tid_mask], f, link_tid); } static bool filter_eq(struct t4_filter_specification *fs1, struct t4_filter_specification *fs2) { int n; MPASS(fs1->hash && fs2->hash); if (fs1->type != fs2->type) return (false); n = fs1->type ? 16 : 4; if (bcmp(&fs1->val.sip[0], &fs2->val.sip[0], n) || bcmp(&fs1->val.dip[0], &fs2->val.dip[0], n) || fs1->val.sport != fs2->val.sport || fs1->val.dport != fs2->val.dport) return (false); /* * We know the masks are the same because all hashfilters conform to the * global tp->filter_mask and the driver has verified that already. 
*/ if ((fs1->mask.pfvf_vld || fs1->mask.ovlan_vld) && fs1->val.vnic != fs2->val.vnic) return (false); if (fs1->mask.vlan_vld && fs1->val.vlan != fs2->val.vlan) return (false); if (fs1->mask.macidx && fs1->val.macidx != fs2->val.macidx) return (false); if (fs1->mask.frag && fs1->val.frag != fs2->val.frag) return (false); if (fs1->mask.matchtype && fs1->val.matchtype != fs2->val.matchtype) return (false); if (fs1->mask.iport && fs1->val.iport != fs2->val.iport) return (false); if (fs1->mask.fcoe && fs1->val.fcoe != fs2->val.fcoe) return (false); if (fs1->mask.proto && fs1->val.proto != fs2->val.proto) return (false); if (fs1->mask.tos && fs1->val.tos != fs2->val.tos) return (false); if (fs1->mask.ethtype && fs1->val.ethtype != fs2->val.ethtype) return (false); return (true); } static struct filter_entry * lookup_hf(struct adapter *sc, struct t4_filter_specification *fs, uint32_t hash) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_4t; struct filter_entry *f; mtx_assert(&t->hftid_lock, MA_OWNED); MPASS(head != NULL); if (hash == 0) hash = hf_hashfn_4t(fs); LIST_FOREACH(f, &head[hash & t->hftid_4t_mask], link_4t) { if (filter_eq(&f->fs, fs)) return (f); } return (NULL); } static struct filter_entry * lookup_hftid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_tid; struct filter_entry *f; uint32_t hash; mtx_assert(&t->hftid_lock, MA_OWNED); MPASS(head != NULL); hash = hf_hashfn_tid(tid); LIST_FOREACH(f, &head[hash & t->hftid_tid_mask], link_tid) { if (f->tid == tid) return (f); } return (NULL); } static void remove_hf(struct adapter *sc, struct filter_entry *f) { struct tid_info *t = &sc->tids; mtx_assert(&t->hftid_lock, MA_OWNED); LIST_REMOVE(f, link_4t); atomic_subtract_int(&t->tids_in_use, f->fs.type ? 2 : 1); } static void remove_hftid(struct adapter *sc, struct filter_entry *f) { #ifdef INVARIANTS struct tid_info *t = &sc->tids; mtx_assert(&t->hftid_lock, MA_OWNED); #endif LIST_REMOVE(f, link_tid); } static uint16_t mode_to_fconf_t4(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_VLAN; if (mode & T4_FILTER_VNIC) fconf |= F_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_FCOE; return (fconf); } static uint16_t mode_to_fconf_t7(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_TCPFLAGS) fconf |= F_TCPFLAGS; if (mode & T4_FILTER_SYNONLY) fconf |= F_SYNONLY; if (mode & T4_FILTER_ROCE) fconf |= F_ROCE; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_T7_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_T7_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_T7_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_T7_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_T7_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_T7_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_T7_VLAN; if (mode & T4_FILTER_VNIC) fconf |= F_T7_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_T7_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_T7_FCOE; if (mode & T4_FILTER_IPSECIDX) fconf |= F_IPSECIDX; return (fconf); } /* * Input: driver's 32b filter mode. * Returns: hardware filter mode (bits to set in vlan_pri_map) for the input. 
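 * For example: a driver mode with T4_FILTER_PORT and T4_FILTER_VLAN set
 * maps to F_PORT | F_VLAN on T4-T6 and to F_T7_PORT | F_T7_VLAN on T7+.
 * The 4-tuple and IPv4/IPv6 bits of the driver mode have no fconf
 * equivalent; they are always available in hardware.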
*/ static uint16_t mode_to_fconf(struct adapter *sc, uint32_t mode) { if (chip_id(sc) >= CHELSIO_T7) return (mode_to_fconf_t7(mode)); else return (mode_to_fconf_t4(mode)); } /* * Input: driver's 32b filter mode. * Returns: hardware vnic mode (ingress config) matching the input. */ static int mode_to_iconf(uint32_t mode) { if ((mode & T4_FILTER_VNIC) == 0) return (-1); /* ingress config doesn't matter. */ if (mode & T4_FILTER_IC_VNIC) return (FW_VNIC_MODE_PF_VF); else if (mode & T4_FILTER_IC_ENCAP) return (FW_VNIC_MODE_ENCAP_EN); else return (FW_VNIC_MODE_OUTER_VLAN); } static int check_fspec_against_fconf_iconf(struct adapter *sc, struct t4_filter_specification *fs) { struct tp_params *tpp = &sc->params.tp; uint32_t fconf = 0; if (chip_id(sc) >= CHELSIO_T7) { if (fs->val.tcpflags || fs->mask.tcpflags) fconf |= F_TCPFLAGS; if (fs->val.synonly || fs->mask.synonly) fconf |= F_SYNONLY; if (fs->val.roce || fs->mask.roce) fconf |= F_ROCE; if (fs->val.frag || fs->mask.frag) fconf |= F_T7_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_T7_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_T7_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_T7_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_T7_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_T7_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_T7_VLAN; if (fs->val.ovlan_vld || fs->mask.ovlan_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_OUTER_VLAN) return (EINVAL); fconf |= F_T7_VNIC_ID; } if (fs->val.pfvf_vld || fs->mask.pfvf_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_PF_VF) return (EINVAL); fconf |= F_T7_VNIC_ID; } #ifdef notyet if (fs->val.encap_vld || fs->mask.encap_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_ENCAP_EN); return (EINVAL); fconf |= F_T7_VNIC_ID; } #endif if (fs->val.iport || fs->mask.iport) fconf |= F_T7_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_T7_FCOE; if (fs->val.ipsecidx || fs->mask.ipsecidx) fconf |= F_IPSECIDX; } else { if (fs->val.tcpflags || fs->mask.tcpflags || fs->val.synonly || fs->mask.synonly || fs->val.roce || fs->mask.roce || fs->val.ipsecidx || fs->mask.ipsecidx) return (EINVAL); if (fs->val.frag || fs->mask.frag) fconf |= F_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_VLAN; if (fs->val.ovlan_vld || fs->mask.ovlan_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_OUTER_VLAN) return (EINVAL); fconf |= F_VNIC_ID; } if (fs->val.pfvf_vld || fs->mask.pfvf_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_PF_VF) return (EINVAL); fconf |= F_VNIC_ID; } #ifdef notyet if (fs->val.encap_vld || fs->mask.encap_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_ENCAP_EN); return (EINVAL); fconf |= F_VNIC_ID; } #endif if (fs->val.iport || fs->mask.iport) fconf |= F_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_FCOE; } if ((tpp->filter_mode | fconf) != tpp->filter_mode) return (E2BIG); return (0); } static uint32_t fconf_to_mode_t4(uint16_t hwmode, int vnic_mode) { uint32_t mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (hwmode & F_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (hwmode & F_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (hwmode & F_MACMATCH) mode |= 
T4_FILTER_MAC_IDX; if (hwmode & F_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (hwmode & F_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (hwmode & F_TOS) mode |= T4_FILTER_IP_TOS; if (hwmode & F_VLAN) mode |= T4_FILTER_VLAN; if (hwmode & F_VNIC_ID) mode |= T4_FILTER_VNIC; /* real meaning depends on vnic_mode. */ if (hwmode & F_PORT) mode |= T4_FILTER_PORT; if (hwmode & F_FCOE) mode |= T4_FILTER_FCoE; switch (vnic_mode) { case FW_VNIC_MODE_PF_VF: mode |= T4_FILTER_IC_VNIC; break; case FW_VNIC_MODE_ENCAP_EN: mode |= T4_FILTER_IC_ENCAP; break; case FW_VNIC_MODE_OUTER_VLAN: default: break; } return (mode); } static uint32_t fconf_to_mode_t7(uint16_t hwmode, int vnic_mode) { uint32_t mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (hwmode & F_TCPFLAGS) mode |= T4_FILTER_TCPFLAGS; if (hwmode & F_SYNONLY) mode |= T4_FILTER_SYNONLY; if (hwmode & F_ROCE) mode |= T4_FILTER_ROCE; if (hwmode & F_T7_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (hwmode & F_T7_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (hwmode & F_T7_MACMATCH) mode |= T4_FILTER_MAC_IDX; if (hwmode & F_T7_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (hwmode & F_T7_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (hwmode & F_T7_TOS) mode |= T4_FILTER_IP_TOS; if (hwmode & F_T7_VLAN) mode |= T4_FILTER_VLAN; if (hwmode & F_T7_VNIC_ID) mode |= T4_FILTER_VNIC; /* real meaning depends on vnic_mode. */ if (hwmode & F_T7_PORT) mode |= T4_FILTER_PORT; if (hwmode & F_T7_FCOE) mode |= T4_FILTER_FCoE; if (hwmode & F_IPSECIDX) mode |= T4_FILTER_IPSECIDX; switch (vnic_mode) { case FW_VNIC_MODE_PF_VF: mode |= T4_FILTER_IC_VNIC; break; case FW_VNIC_MODE_ENCAP_EN: mode |= T4_FILTER_IC_ENCAP; break; case FW_VNIC_MODE_OUTER_VLAN: default: break; } return (mode); } /* * Input: hardware filter configuration (filter mode/mask, ingress config). * Output: driver's 32b filter mode matching the input. */ static inline uint32_t fconf_to_mode(struct adapter *sc, uint16_t hwmode, int vnic_mode) { if (chip_id(sc) >= CHELSIO_T7) return (fconf_to_mode_t7(hwmode, vnic_mode)); else return (fconf_to_mode_t4(hwmode, vnic_mode)); } int get_filter_mode(struct adapter *sc, uint32_t *mode) { struct tp_params *tp = &sc->params.tp; uint16_t filter_mode; /* Filter mask must comply with the global filter mode. */ MPASS((tp->filter_mode | tp->filter_mask) == tp->filter_mode); /* Non-zero incoming value in mode means "hashfilter mode". */ filter_mode = *mode ? tp->filter_mask : tp->filter_mode; *mode = fconf_to_mode(sc, filter_mode, tp->vnic_mode); return (0); } int set_filter_mode(struct adapter *sc, uint32_t mode) { struct tp_params *tp = &sc->params.tp; int rc, iconf; uint16_t fconf; iconf = mode_to_iconf(mode); fconf = mode_to_fconf(sc, mode); if ((iconf == -1 || iconf == tp->vnic_mode) && fconf == tp->filter_mode) return (0); /* Nothing to do */ rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setfm"); if (rc) return (rc); if (!hw_all_ok(sc)) { rc = ENXIO; goto done; } if (sc->tids.ftids_in_use > 0 || /* TCAM filters active */ sc->tids.hpftids_in_use > 0 || /* hi-pri TCAM filters active */ sc->tids.tids_in_use > 0) { /* TOE or hashfilters active */ rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (uld_active(sc, ULD_TOM)) { rc = EBUSY; goto done; } #endif /* Note that filter mask will get clipped to the new filter mode. 
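 * For example, if the current mask has F_VLAN but the new mode does not,
 * F_VLAN is dropped from the mask as well, and hashfilters created later
 * must conform to the clipped mask (see hashfilter_ntuple()).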
*/ rc = -t4_set_filter_cfg(sc, fconf, -1, iconf); done: end_synchronized_op(sc, 0); return (rc); } int set_filter_mask(struct adapter *sc, uint32_t mode) { struct tp_params *tp = &sc->params.tp; int rc, iconf; uint16_t fmask; iconf = mode_to_iconf(mode); fmask = mode_to_fconf(sc, mode); if ((iconf == -1 || iconf == tp->vnic_mode) && fmask == tp->filter_mask) return (0); /* Nothing to do */ /* * We aren't going to change the global filter mode or VNIC mode here. * The given filter mask must conform to them. */ if ((fmask | tp->filter_mode) != tp->filter_mode) return (EINVAL); if (iconf != -1 && iconf != tp->vnic_mode) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sethfm"); if (rc) return (rc); if (!hw_all_ok(sc)) { rc = ENXIO; goto done; } if (sc->tids.tids_in_use > 0) { /* TOE or hashfilters active */ rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (uld_active(sc, ULD_TOM)) { rc = EBUSY; goto done; } #endif rc = -t4_set_filter_cfg(sc, -1, fmask, -1); done: end_synchronized_op(sc, 0); return (rc); } static inline uint64_t get_filter_hits(struct adapter *sc, uint32_t tid) { uint32_t tcb_addr; uint64_t hits; tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; mtx_lock(&sc->reg_lock); if (!hw_all_ok(sc)) hits = 0; else if (is_t4(sc)) { uint64_t t; read_via_memwin(sc, 0, tcb_addr + 16, (uint32_t *)&t, 8); hits = be64toh(t); } else { uint32_t t; read_via_memwin(sc, 0, tcb_addr + 24, &t, 4); hits = be32toh(t); } mtx_unlock(&sc->reg_lock); return (hits); } int get_filter(struct adapter *sc, struct t4_filter *t) { if (t->fs.hash) return (get_hashfilter(sc, t)); else return (get_tcamfilter(sc, t)); } static int set_tcamfilter(struct adapter *sc, struct t4_filter *t, struct l2t_entry *l2te, struct smt_entry *smt) { struct filter_entry *f; struct fw_filter2_wr *fwr; u_int vnic_vld, vnic_vld_mask; struct wrq_cookie cookie; int i, rc, busy, locked; u_int tid; const int ntids = t->fs.type ? 4 : 1; MPASS(!t->fs.hash); /* Already validated against fconf, iconf */ MPASS((t->fs.val.pfvf_vld & t->fs.val.ovlan_vld) == 0); MPASS((t->fs.mask.pfvf_vld & t->fs.mask.ovlan_vld) == 0); if (separate_hpfilter_region(sc) && t->fs.prio) { MPASS(t->idx < sc->tids.nhpftids); f = &sc->tids.hpftid_tab[t->idx]; tid = sc->tids.hpftid_base + t->idx; } else { MPASS(t->idx < sc->tids.nftids); f = &sc->tids.ftid_tab[t->idx]; tid = sc->tids.ftid_base + t->idx; } rc = busy = locked = 0; mtx_lock(&sc->tids.ftid_lock); for (i = 0; i < ntids; i++) { busy += f[i].pending + f[i].valid; locked += f[i].locked; } if (locked > 0) rc = EPERM; else if (busy > 0) rc = EBUSY; else { int len16; if (sc->params.filter2_wr_support) len16 = howmany(sizeof(struct fw_filter2_wr), 16); else len16 = howmany(sizeof(struct fw_filter_wr), 16); fwr = start_wrq_wr(&sc->sge.ctrlq[0], len16, &cookie); if (__predict_false(fwr == NULL)) rc = ENOMEM; else { f->pending = 1; if (separate_hpfilter_region(sc) && t->fs.prio) sc->tids.hpftids_in_use++; else sc->tids.ftids_in_use++; } } mtx_unlock(&sc->tids.ftid_lock); if (rc != 0) return (rc); /* * Can't fail now. A set-filter WR will definitely be sent. 
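 * The filter entry has been marked pending, the ftid/hpftid in-use count
 * has been bumped, and the work request slot was reserved by
 * start_wrq_wr(), so all that remains is to fill in the WR and commit it.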
*/ f->tid = tid; f->fs = t->fs; f->l2te = l2te; f->smt = smt; if (t->fs.val.pfvf_vld || t->fs.val.ovlan_vld) vnic_vld = 1; else vnic_vld = 0; if (t->fs.mask.pfvf_vld || t->fs.mask.ovlan_vld) vnic_vld_mask = 1; else vnic_vld_mask = 0; bzero(fwr, sizeof(*fwr)); if (sc->params.filter2_wr_support) fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER2_WR)); else fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); fwr->tid_to_iq = htobe32(V_FW_FILTER_WR_TID(f->tid) | V_FW_FILTER_WR_RQTYPE(f->fs.type) | V_FW_FILTER_WR_NOREPLY(0) | V_FW_FILTER_WR_IQ(f->fs.iq)); fwr->del_filter_to_l2tix = htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | V_FW_FILTER_WR_DMAC(f->fs.newdmac) | V_FW_FILTER_WR_SMAC(f->fs.newsmac) | V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | V_FW_FILTER_WR_TXCHAN(f->fs.eport) | V_FW_FILTER_WR_PRIO(f->fs.prio) | V_FW_FILTER_WR_L2TIX(f->l2te ? f->l2te->idx : 0)); fwr->ethtype = htobe16(f->fs.val.ethtype); fwr->ethtypem = htobe16(f->fs.mask.ethtype); fwr->frag_to_ovlan_vldm = (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLD(vnic_vld) | V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLDM(vnic_vld_mask)); fwr->smac_sel = 0; fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) | V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id)); fwr->maci_to_matchtypem = htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | V_FW_FILTER_WR_PORT(f->fs.val.iport) | V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); fwr->ptcl = f->fs.val.proto; fwr->ptclm = f->fs.mask.proto; fwr->ttyp = f->fs.val.tos; fwr->ttypm = f->fs.mask.tos; fwr->ivlan = htobe16(f->fs.val.vlan); fwr->ivlanm = htobe16(f->fs.mask.vlan); fwr->ovlan = htobe16(f->fs.val.vnic); fwr->ovlanm = htobe16(f->fs.mask.vnic); bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip)); bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm)); bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip)); bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm)); fwr->lp = htobe16(f->fs.val.dport); fwr->lpm = htobe16(f->fs.mask.dport); fwr->fp = htobe16(f->fs.val.sport); fwr->fpm = htobe16(f->fs.mask.sport); /* sma = 0 tells the fw to use SMAC_SEL for source MAC address */ bzero(fwr->sma, sizeof (fwr->sma)); if (sc->params.filter2_wr_support) { fwr->filter_type_swapmac = V_FW_FILTER2_WR_SWAPMAC(f->fs.swapmac); fwr->natmode_to_ulp_type = V_FW_FILTER2_WR_ULP_TYPE(f->fs.nat_mode ? ULP_MODE_TCPDDP : ULP_MODE_NONE) | V_FW_FILTER2_WR_NATFLAGCHECK(f->fs.nat_flag_chk) | V_FW_FILTER2_WR_NATMODE(f->fs.nat_mode); memcpy(fwr->newlip, f->fs.nat_dip, sizeof(fwr->newlip)); memcpy(fwr->newfip, f->fs.nat_sip, sizeof(fwr->newfip)); fwr->newlport = htobe16(f->fs.nat_dport); fwr->newfport = htobe16(f->fs.nat_sport); fwr->natseqcheck = htobe32(f->fs.nat_seq_chk); } commit_wrq_wr(&sc->sge.ctrlq[0], fwr, &cookie); /* Wait for response. 
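 * The firmware reply is handled by t4_filter_rpl(), which clears
 * f->pending and broadcasts ftid_cv.  If the sleep is interrupted, rc is
 * set to EINPROGRESS and the filter completes asynchronously.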
*/ mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? 0 : EIO; break; } if (cv_wait_sig(&sc->tids.ftid_cv, &sc->tids.ftid_lock) != 0) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); return (rc); } static int hashfilter_ntuple(struct adapter *sc, const struct t4_filter_specification *fs, uint64_t *ftuple) { struct tp_params *tp = &sc->params.tp; uint16_t fmask; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ #define SFF(V, S) ((uint64_t)(V) << S) /* Shifted Filter Field. */ *ftuple = fmask = 0; if (chip_id(sc) >= CHELSIO_T7) { if (tp->ipsecidx_shift >= 0 && fs->mask.ipsecidx) { *ftuple |= SFF(fs->val.ipsecidx, tp->ipsecidx_shift); fmask |= F_IPSECIDX; } if (tp->fcoe_shift >= 0 && fs->mask.fcoe) { *ftuple |= SFF(fs->val.fcoe, tp->fcoe_shift); fmask |= F_T7_FCOE; } if (tp->port_shift >= 0 && fs->mask.iport) { *ftuple |= (uint64_t)fs->val.iport << tp->port_shift; fmask |= F_T7_PORT; } if (tp->vnic_shift >= 0 && fs->mask.vnic) { /* vnic_mode was already validated. */ if (tp->vnic_mode == FW_VNIC_MODE_PF_VF) MPASS(fs->mask.pfvf_vld); else if (tp->vnic_mode == FW_VNIC_MODE_OUTER_VLAN) MPASS(fs->mask.ovlan_vld); #ifdef notyet else if (tp->vnic_mode == FW_VNIC_MODE_ENCAP_EN) MPASS(fs->mask.encap_vld); #endif *ftuple |= SFF(F_FT_VNID_ID_VLD | fs->val.vnic, tp->vnic_shift); fmask |= F_T7_VNIC_ID; } if (tp->vlan_shift >= 0 && fs->mask.vlan) { *ftuple |= SFF(F_FT_VLAN_VLD | fs->val.vlan, tp->vlan_shift); fmask |= F_T7_VLAN; } if (tp->tos_shift >= 0 && fs->mask.tos) { *ftuple |= SFF(fs->val.tos, tp->tos_shift); fmask |= F_T7_TOS; } if (tp->protocol_shift >= 0 && fs->mask.proto) { *ftuple |= SFF(fs->val.proto, tp->protocol_shift); fmask |= F_T7_PROTOCOL; } if (tp->ethertype_shift >= 0 && fs->mask.ethtype) { *ftuple |= SFF(fs->val.ethtype, tp->ethertype_shift); fmask |= F_T7_ETHERTYPE; } if (tp->macmatch_shift >= 0 && fs->mask.macidx) { *ftuple |= SFF(fs->val.macidx, tp->macmatch_shift); fmask |= F_T7_MACMATCH; } if (tp->matchtype_shift >= 0 && fs->mask.matchtype) { *ftuple |= SFF(fs->val.matchtype, tp->matchtype_shift); fmask |= F_T7_MPSHITTYPE; } if (tp->frag_shift >= 0 && fs->mask.frag) { *ftuple |= SFF(fs->val.frag, tp->frag_shift); fmask |= F_T7_FRAGMENTATION; } if (tp->roce_shift >= 0 && fs->mask.roce) { *ftuple |= SFF(fs->val.roce, tp->roce_shift); fmask |= F_ROCE; } if (tp->synonly_shift >= 0 && fs->mask.synonly) { *ftuple |= SFF(fs->val.synonly, tp->synonly_shift); fmask |= F_SYNONLY; } if (tp->tcpflags_shift >= 0 && fs->mask.tcpflags) { *ftuple |= SFF(fs->val.tcpflags, tp->synonly_shift); fmask |= F_TCPFLAGS; } } else { if (fs->mask.ipsecidx || fs->mask.roce || fs->mask.synonly || fs->mask.tcpflags) { MPASS(tp->ipsecidx_shift == -1); MPASS(tp->roce_shift == -1); MPASS(tp->synonly_shift == -1); MPASS(tp->tcpflags_shift == -1); return (EINVAL); } if (tp->fcoe_shift >= 0 && fs->mask.fcoe) { *ftuple |= SFF(fs->val.fcoe, tp->fcoe_shift); fmask |= F_FCOE; } if (tp->port_shift >= 0 && fs->mask.iport) { *ftuple |= (uint64_t)fs->val.iport << tp->port_shift; fmask |= F_PORT; } if (tp->vnic_shift >= 0 && fs->mask.vnic) { /* vnic_mode was already validated. 
*/ if (tp->vnic_mode == FW_VNIC_MODE_PF_VF) MPASS(fs->mask.pfvf_vld); else if (tp->vnic_mode == FW_VNIC_MODE_OUTER_VLAN) MPASS(fs->mask.ovlan_vld); #ifdef notyet else if (tp->vnic_mode == FW_VNIC_MODE_ENCAP_EN) MPASS(fs->mask.encap_vld); #endif *ftuple |= SFF(F_FT_VNID_ID_VLD | fs->val.vnic, tp->vnic_shift); fmask |= F_VNIC_ID; } if (tp->vlan_shift >= 0 && fs->mask.vlan) { *ftuple |= SFF(F_FT_VLAN_VLD | fs->val.vlan, tp->vlan_shift); fmask |= F_VLAN; } if (tp->tos_shift >= 0 && fs->mask.tos) { *ftuple |= SFF(fs->val.tos, tp->tos_shift); fmask |= F_TOS; } if (tp->protocol_shift >= 0 && fs->mask.proto) { *ftuple |= SFF(fs->val.proto, tp->protocol_shift); fmask |= F_PROTOCOL; } if (tp->ethertype_shift >= 0 && fs->mask.ethtype) { *ftuple |= SFF(fs->val.ethtype, tp->ethertype_shift); fmask |= F_ETHERTYPE; } if (tp->macmatch_shift >= 0 && fs->mask.macidx) { *ftuple |= SFF(fs->val.macidx, tp->macmatch_shift); fmask |= F_MACMATCH; } if (tp->matchtype_shift >= 0 && fs->mask.matchtype) { *ftuple |= SFF(fs->val.matchtype, tp->matchtype_shift); fmask |= F_MPSHITTYPE; } if (tp->frag_shift >= 0 && fs->mask.frag) { *ftuple |= SFF(fs->val.frag, tp->frag_shift); fmask |= F_FRAGMENTATION; } } #undef SFF /* A hashfilter must conform to the hardware filter mask. */ if (fmask != tp->filter_mask) return (EINVAL); return (0); } static bool is_4tuple_specified(struct t4_filter_specification *fs) { int i; const int n = fs->type ? 16 : 4; if (fs->mask.sport != 0xffff || fs->mask.dport != 0xffff) return (false); for (i = 0; i < n; i++) { if (fs->mask.sip[i] != 0xff) return (false); if (fs->mask.dip[i] != 0xff) return (false); } return (true); } int set_filter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; struct l2t_entry *l2te = NULL; struct smt_entry *smt = NULL; uint64_t ftuple; int rc; /* * Basic filter checks first. */ if (t->fs.hash) { if (!is_hashfilter(sc) || ti->ntids == 0) return (ENOTSUP); /* Hardware, not user, selects a tid for hashfilters. */ if (t->idx != (uint32_t)-1) return (EINVAL); /* T5 can't count hashfilter hits. */ if (is_t5(sc) && t->fs.hitcnts) return (EINVAL); if (!is_4tuple_specified(&t->fs)) return (EINVAL); rc = hashfilter_ntuple(sc, &t->fs, &ftuple); if (rc != 0) return (rc); } else { if (separate_hpfilter_region(sc) && t->fs.prio) { if (ti->nhpftids == 0) return (ENOTSUP); if (t->idx >= ti->nhpftids) return (EINVAL); } else { if (ti->nftids == 0) return (ENOTSUP); if (t->idx >= ti->nftids) return (EINVAL); } /* IPv6 filter idx must be 4 aligned */ if (t->fs.type == 1 && ((t->idx & 0x3) || t->idx + 4 >= ti->nftids)) return (EINVAL); } /* T4 doesn't support VLAN tag removal or rewrite, swapmac, and NAT. */ if (is_t4(sc) && t->fs.action == FILTER_SWITCH && (t->fs.newvlan == VLAN_REMOVE || t->fs.newvlan == VLAN_REWRITE || t->fs.swapmac || t->fs.nat_mode)) return (ENOTSUP); if (t->fs.action == FILTER_SWITCH && t->fs.eport >= sc->params.nports) return (EINVAL); if (t->fs.val.iport >= sc->params.nports) return (EINVAL); /* Can't specify an iqid/rss_info if not steering. */ if (!t->fs.dirsteer && !t->fs.dirsteerhash && !t->fs.maskhash && t->fs.iq) return (EINVAL); /* Validate against the global filter mode and ingress config */ rc = check_fspec_against_fconf_iconf(sc, &t->fs); if (rc != 0) return (rc); /* * Basic checks passed. Make sure the queues and tid tables are setup. 
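 * This may bring up the adapter (adapter_init) and lazily allocates the
 * hashfilter hash tables or the TCAM filter table (hpftid_tab/ftid_tab)
 * that this request needs, all under a synchronized op.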
*/ rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf"); if (rc) return (rc); if (!hw_all_ok(sc)) { rc = ENXIO; goto done; } if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_init(sc)) != 0)) goto done; if (t->fs.hash) { if (__predict_false(ti->hftid_hash_4t == NULL)) { rc = alloc_hftid_hash(&sc->tids, HASH_NOWAIT); if (rc != 0) goto done; } } else if (separate_hpfilter_region(sc) && t->fs.prio && __predict_false(ti->hpftid_tab == NULL)) { MPASS(ti->nhpftids != 0); KASSERT(ti->hpftids_in_use == 0, ("%s: no memory allocated but hpftids_in_use is %u", __func__, ti->hpftids_in_use)); ti->hpftid_tab = malloc(sizeof(struct filter_entry) * ti->nhpftids, M_CXGBE, M_NOWAIT | M_ZERO); if (ti->hpftid_tab == NULL) { rc = ENOMEM; goto done; } if (!mtx_initialized(&sc->tids.ftid_lock)) { mtx_init(&ti->ftid_lock, "T4 filters", 0, MTX_DEF); cv_init(&ti->ftid_cv, "t4fcv"); } } else if (__predict_false(ti->ftid_tab == NULL)) { MPASS(ti->nftids != 0); KASSERT(ti->ftids_in_use == 0, ("%s: no memory allocated but ftids_in_use is %u", __func__, ti->ftids_in_use)); ti->ftid_tab = malloc(sizeof(struct filter_entry) * ti->nftids, M_CXGBE, M_NOWAIT | M_ZERO); if (ti->ftid_tab == NULL) { rc = ENOMEM; goto done; } if (!mtx_initialized(&sc->tids.ftid_lock)) { mtx_init(&ti->ftid_lock, "T4 filters", 0, MTX_DEF); cv_init(&ti->ftid_cv, "t4fcv"); } } done: end_synchronized_op(sc, 0); if (rc != 0) return (rc); /* * Allocate L2T entry, SMT entry, etc. */ if (t->fs.newdmac || t->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ l2te = t4_l2t_alloc_switching(sc, t->fs.vlan, t->fs.eport, t->fs.dmac); if (__predict_false(l2te == NULL)) { rc = EAGAIN; goto error; } } if (t->fs.newsmac) { /* This filter needs an SMT entry; allocate one. */ smt = t4_smt_alloc_switching(sc->smt, t->fs.smac); if (__predict_false(smt == NULL)) { rc = EAGAIN; goto error; } rc = t4_smt_set_switching(sc, smt, 0x0, t->fs.smac); if (rc) goto error; } if (t->fs.hash) rc = set_hashfilter(sc, t, ftuple, l2te, smt); else rc = set_tcamfilter(sc, t, l2te, smt); if (rc != 0 && rc != EINPROGRESS) { error: if (l2te) t4_l2t_release(l2te); if (smt) t4_smt_release(smt); } return (rc); } static int del_tcamfilter(struct adapter *sc, struct t4_filter *t) { struct filter_entry *f; struct fw_filter_wr *fwr; struct wrq_cookie cookie; int rc, nfilters; #ifdef INVARIANTS u_int tid_base; #endif mtx_lock(&sc->tids.ftid_lock); if (separate_hpfilter_region(sc) && t->fs.prio) { nfilters = sc->tids.nhpftids; f = sc->tids.hpftid_tab; #ifdef INVARIANTS tid_base = sc->tids.hpftid_base; #endif } else { nfilters = sc->tids.nftids; f = sc->tids.ftid_tab; #ifdef INVARIANTS tid_base = sc->tids.ftid_base; #endif } MPASS(f != NULL); /* Caller checked this. */ if (t->idx >= nfilters) { rc = EINVAL; goto done; } f += t->idx; if (f->locked) { rc = EPERM; goto done; } if (f->pending) { rc = EBUSY; goto done; } if (f->valid == 0) { rc = EINVAL; goto done; } MPASS(f->tid == tid_base + t->idx); fwr = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*fwr), 16), &cookie); if (fwr == NULL) { rc = ENOMEM; goto done; } bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(f->tid, fwr, sc->sge.fwq.abs_id); f->pending = 1; commit_wrq_wr(&sc->sge.ctrlq[0], fwr, &cookie); t->fs = f->fs; /* extra info for the caller */ for (;;) { if (f->pending == 0) { rc = f->valid ? 
			    EIO : 0;
			break;
		}
		if (cv_wait_sig(&sc->tids.ftid_cv, &sc->tids.ftid_lock) != 0) {
			rc = EINPROGRESS;
			break;
		}
	}
done:
	mtx_unlock(&sc->tids.ftid_lock);
	return (rc);
}

int
del_filter(struct adapter *sc, struct t4_filter *t)
{
	/* No filters possible if not initialized yet. */
	if (!(sc->flags & FULL_INIT_DONE))
		return (EINVAL);

	/*
	 * The checks for tid tables ensure that the locks that del_* will reach
	 * for are initialized.
	 */
	if (t->fs.hash) {
		if (sc->tids.hftid_hash_4t != NULL)
			return (del_hashfilter(sc, t));
	} else if (separate_hpfilter_region(sc) && t->fs.prio) {
		if (sc->tids.hpftid_tab != NULL)
			return (del_tcamfilter(sc, t));
	} else {
		if (sc->tids.ftid_tab != NULL)
			return (del_tcamfilter(sc, t));
	}

	return (EINVAL);
}

/*
 * Release secondary resources associated with the filter.
 */
static void
free_filter_resources(struct filter_entry *f)
{
	if (f->l2te) {
		t4_l2t_release(f->l2te);
		f->l2te = NULL;
	}
	if (f->smt) {
		t4_smt_release(f->smt);
		f->smt = NULL;
	}
}

static int
set_tcb_field(struct adapter *sc, u_int tid, uint16_t word, uint64_t mask,
    uint64_t val, int no_reply)
{
	struct wrq_cookie cookie;
	struct cpl_set_tcb_field *req;

	req = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*req), 16),
	    &cookie);
	if (req == NULL)
		return (ENOMEM);
	bzero(req, sizeof(*req));
	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid);
-	if (no_reply == 0) {
-		req->reply_ctrl = htobe16(V_QUEUENO(sc->sge.fwq.abs_id) |
-		    V_NO_REPLY(0));
-	} else
-		req->reply_ctrl = htobe16(V_NO_REPLY(1));
+	if (no_reply) {
+		req->reply_ctrl = htobe16(F_NO_REPLY);
+	} else {
+		const int qid = sc->sge.fwq.abs_id;
+
+		if (chip_id(sc) >= CHELSIO_T7) {
+			req->reply_ctrl = htobe16(V_T7_QUEUENO(qid) |
+			    V_T7_REPLY_CHAN(0) | V_NO_REPLY(0));
+		} else {
+			req->reply_ctrl = htobe16(V_QUEUENO(qid) |
+			    V_REPLY_CHAN(0) | V_NO_REPLY(0));
+		}
+	}
	req->word_cookie = htobe16(V_WORD(word) |
	    V_COOKIE(CPL_COOKIE_HASHFILTER));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	commit_wrq_wr(&sc->sge.ctrlq[0], req, &cookie);

	return (0);
}

/* Set one of the t_flags bits in the TCB. */
static inline int
set_tcb_tflag(struct adapter *sc, int tid, u_int bit_pos, u_int val,
    u_int no_reply)
{
	return (set_tcb_field(sc, tid, W_TCB_T_FLAGS, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos, no_reply));
}

int
t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1);
	u_int tid = GET_TID(rpl);
	u_int rc, idx;
	struct filter_entry *f;

	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
	    rss->opcode));

	if (is_hpftid(sc, tid)) {
		idx = tid - sc->tids.hpftid_base;
		f = &sc->tids.hpftid_tab[idx];
	} else if (is_ftid(sc, tid)) {
		idx = tid - sc->tids.ftid_base;
		f = &sc->tids.ftid_tab[idx];
	} else
		panic("%s: FW reply for invalid TID %d.", __func__, tid);
	MPASS(f->tid == tid);
	rc = G_COOKIE(rpl->cookie);

	mtx_lock(&sc->tids.ftid_lock);
	KASSERT(f->pending, ("%s: reply %d for filter[%u] that isn't pending.",
	    __func__, rc, tid));
	switch(rc) {
	case FW_FILTER_WR_FLT_ADDED:
		/* set-filter succeeded */
		f->valid = 1;
		if (f->fs.newsmac) {
			MPASS(f->smt != NULL);
			set_tcb_tflag(sc, f->tid, S_TF_CCTRL_CWR, 1, 1);
			set_tcb_field(sc, f->tid, W_TCB_SMAC_SEL,
			    V_TCB_SMAC_SEL(M_TCB_SMAC_SEL),
			    V_TCB_SMAC_SEL(f->smt->idx), 1);
			/* XXX: wait for reply to TCB update before !pending */
		}
		break;
	case FW_FILTER_WR_FLT_DELETED:
		/* del-filter succeeded */
		MPASS(f->valid == 1);
		f->valid = 0;
		/* Fall through */
	case FW_FILTER_WR_SMT_TBL_FULL:
		/* set-filter failed due to lack of SMT space.
*/ MPASS(f->valid == 0); free_filter_resources(f); if (separate_hpfilter_region(sc) && f->fs.prio) sc->tids.hpftids_in_use--; else sc->tids.ftids_in_use--; break; case FW_FILTER_WR_SUCCESS: case FW_FILTER_WR_EINVAL: default: panic("%s: unexpected reply %d for filter[%d].", __func__, rc, idx); } f->pending = 0; cv_broadcast(&sc->tids.ftid_cv); mtx_unlock(&sc->tids.ftid_lock); return (0); } /* * This is the reply to the Active Open that created the filter. Additional TCB * updates may be required to complete the filter configuration. */ int t4_hashfilter_ao_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct filter_entry *f = lookup_atid(sc, atid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); mtx_lock(&sc->tids.hftid_lock); KASSERT(f->pending, ("%s: hashfilter[%p] isn't pending.", __func__, f)); KASSERT(f->tid == -1, ("%s: hashfilter[%p] has tid %d already.", __func__, f, f->tid)); if (status == CPL_ERR_NONE) { f->tid = GET_TID(cpl); MPASS(lookup_hftid(sc, f->tid) == NULL); insert_hftid(sc, f); /* * Leave the filter pending until it is fully set up, which will * be indicated by the reply to the last TCB update. No need to * unblock the ioctl thread either. */ if (configure_hashfilter_tcb(sc, f) == EINPROGRESS) goto done; f->valid = 1; f->pending = 0; } else { /* provide errno instead of tid to ioctl */ f->tid = act_open_rpl_status_to_errno(status); f->valid = 0; f->pending = 0; if (act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), &sc->sge.ctrlq[0]); free_filter_resources(f); remove_hf(sc, f); if (f->locked == 0) free(f, M_CXGBE); } cv_broadcast(&sc->tids.hftid_cv); done: mtx_unlock(&sc->tids.hftid_lock); free_atid(sc, atid); return (0); } int t4_hashfilter_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); u_int tid = GET_TID(rpl); struct filter_entry *f; mtx_lock(&sc->tids.hftid_lock); f = lookup_hftid(sc, tid); KASSERT(f->tid == tid, ("%s: filter tid mismatch", __func__)); KASSERT(f->pending, ("%s: hashfilter %p [%u] isn't pending.", __func__, f, tid)); KASSERT(f->valid == 0, ("%s: hashfilter %p [%u] is valid already.", __func__, f, tid)); f->pending = 0; if (rpl->status == 0) { f->valid = 1; } else { f->tid = EIO; f->valid = 0; free_filter_resources(f); remove_hftid(sc, f); remove_hf(sc, f); release_tid(sc, tid, &sc->sge.ctrlq[0]); if (f->locked == 0) free(f, M_CXGBE); } cv_broadcast(&sc->tids.hftid_cv); mtx_unlock(&sc->tids.hftid_lock); return (0); } int t4_del_hashfilter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct filter_entry *f; mtx_lock(&sc->tids.hftid_lock); f = lookup_hftid(sc, tid); KASSERT(f->tid == tid, ("%s: filter tid mismatch", __func__)); KASSERT(f->pending, ("%s: hashfilter %p [%u] isn't pending.", __func__, f, tid)); KASSERT(f->valid, ("%s: hashfilter %p [%u] isn't valid.", __func__, f, tid)); f->pending = 0; if (cpl->status == 0) { f->valid = 0; free_filter_resources(f); remove_hftid(sc, f); remove_hf(sc, f); release_tid(sc, tid, &sc->sge.ctrlq[0]); if (f->locked == 0) free(f, M_CXGBE); } cv_broadcast(&sc->tids.hftid_cv); 
mtx_unlock(&sc->tids.hftid_lock); return (0); } static int get_tcamfilter(struct adapter *sc, struct t4_filter *t) { int i, nfilters; struct filter_entry *f; u_int in_use; #ifdef INVARIANTS u_int tid_base; #endif MPASS(!t->fs.hash); if (separate_hpfilter_region(sc) && t->fs.prio) { nfilters = sc->tids.nhpftids; f = sc->tids.hpftid_tab; in_use = sc->tids.hpftids_in_use; #ifdef INVARIANTS tid_base = sc->tids.hpftid_base; #endif } else { nfilters = sc->tids.nftids; f = sc->tids.ftid_tab; in_use = sc->tids.ftids_in_use; #ifdef INVARIANTS tid_base = sc->tids.ftid_base; #endif } if (in_use == 0 || f == NULL || t->idx >= nfilters) { t->idx = 0xffffffff; return (0); } f += t->idx; mtx_lock(&sc->tids.ftid_lock); for (i = t->idx; i < nfilters; i++, f++) { if (f->valid) { MPASS(f->tid == tid_base + i); t->idx = i; t->l2tidx = f->l2te ? f->l2te->idx : 0; t->smtidx = f->smt ? f->smt->idx : 0; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, f->tid); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: mtx_unlock(&sc->tids.ftid_lock); return (0); } static int get_hashfilter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; int tid; struct filter_entry *f; const int inv_tid = ti->ntids + ti->tid_base; MPASS(t->fs.hash); if (ti->tids_in_use == 0 || ti->hftid_hash_tid == NULL || t->idx >= inv_tid) { t->idx = 0xffffffff; return (0); } if (t->idx < ti->tid_base) t->idx = ti->tid_base; mtx_lock(&ti->hftid_lock); for (tid = t->idx; tid < inv_tid; tid++) { f = lookup_hftid(sc, tid); if (f != NULL && f->valid) { t->idx = tid; t->l2tidx = f->l2te ? f->l2te->idx : 0; t->smtidx = f->smt ? f->smt->idx : 0; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, tid); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: mtx_unlock(&ti->hftid_lock); return (0); } static void mk_act_open_req6(struct adapter *sc, struct filter_entry *f, int atid, uint64_t ftuple, struct cpl_act_open_req6 *cpl) { struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; /* Review changes to CPL after cpl_t6_act_open_req if this goes off. */ MPASS(chip_id(sc) >= CHELSIO_T5 && chip_id(sc) <= CHELSIO_T6); MPASS(atid >= 0); if (chip_id(sc) == CHELSIO_T5) { INIT_TP_WR(cpl5, 0); } else { INIT_TP_WR(cpl6, 0); cpl6->rsvd2 = 0; cpl6->opt3 = 0; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_HASHFILTER))); cpl->local_port = htobe16(f->fs.val.dport); cpl->peer_port = htobe16(f->fs.val.sport); cpl->local_ip_hi = *(uint64_t *)(&f->fs.val.dip); cpl->local_ip_lo = *(((uint64_t *)&f->fs.val.dip) + 1); cpl->peer_ip_hi = *(uint64_t *)(&f->fs.val.sip); cpl->peer_ip_lo = *(((uint64_t *)&f->fs.val.sip) + 1); cpl->opt0 = htobe64(V_NAGLE(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_DELACK(f->fs.hitcnts) | V_L2T_IDX(f->l2te ? f->l2te->idx : 0) | V_TX_CHAN(f->fs.eport) | V_NO_CONG(f->fs.rpttid) | V_ULP_MODE(f->fs.nat_mode ? ULP_MODE_TCPDDP : ULP_MODE_NONE) | F_TCAM_BYPASS | F_NON_OFFLOAD); cpl6->params = htobe64(V_FILTER_TUPLE(ftuple)); cpl6->opt2 = htobe32(F_RSS_QUEUE_VALID | V_RSS_QUEUE(f->fs.iq) | V_TX_QUEUE(f->fs.nat_mode) | V_WND_SCALE_EN(f->fs.nat_flag_chk) | V_RX_FC_DISABLE(f->fs.nat_seq_chk ? 
1 : 0) | F_T5_OPT_2_VALID | F_RX_CHANNEL | V_SACK_EN(f->fs.swapmac) | V_CONG_CNTRL((f->fs.action == FILTER_DROP) | (f->fs.dirsteer << 1)) | V_PACE(f->fs.maskhash | (f->fs.dirsteerhash << 1))); } static void mk_act_open_req(struct adapter *sc, struct filter_entry *f, int atid, uint64_t ftuple, struct cpl_act_open_req *cpl) { struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; /* Review changes to CPL after cpl_t6_act_open_req if this goes off. */ MPASS(chip_id(sc) >= CHELSIO_T5 && chip_id(sc) <= CHELSIO_T6); MPASS(atid >= 0); if (chip_id(sc) == CHELSIO_T5) { INIT_TP_WR(cpl5, 0); } else { INIT_TP_WR(cpl6, 0); cpl6->rsvd2 = 0; cpl6->opt3 = 0; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_HASHFILTER))); cpl->local_port = htobe16(f->fs.val.dport); cpl->peer_port = htobe16(f->fs.val.sport); cpl->local_ip = f->fs.val.dip[0] | f->fs.val.dip[1] << 8 | f->fs.val.dip[2] << 16 | f->fs.val.dip[3] << 24; cpl->peer_ip = f->fs.val.sip[0] | f->fs.val.sip[1] << 8 | f->fs.val.sip[2] << 16 | f->fs.val.sip[3] << 24; cpl->opt0 = htobe64(V_NAGLE(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_DELACK(f->fs.hitcnts) | V_L2T_IDX(f->l2te ? f->l2te->idx : 0) | V_TX_CHAN(f->fs.eport) | V_NO_CONG(f->fs.rpttid) | V_ULP_MODE(f->fs.nat_mode ? ULP_MODE_TCPDDP : ULP_MODE_NONE) | F_TCAM_BYPASS | F_NON_OFFLOAD); cpl6->params = htobe64(V_FILTER_TUPLE(ftuple)); cpl6->opt2 = htobe32(F_RSS_QUEUE_VALID | V_RSS_QUEUE(f->fs.iq) | V_TX_QUEUE(f->fs.nat_mode) | V_WND_SCALE_EN(f->fs.nat_flag_chk) | V_RX_FC_DISABLE(f->fs.nat_seq_chk ? 1 : 0) | F_T5_OPT_2_VALID | F_RX_CHANNEL | V_SACK_EN(f->fs.swapmac) | V_CONG_CNTRL((f->fs.action == FILTER_DROP) | (f->fs.dirsteer << 1)) | V_PACE(f->fs.maskhash | (f->fs.dirsteerhash << 1))); } static int act_open_cpl_len16(struct adapter *sc, int isipv6) { int idx; static const int sz_table[4][2] = { { howmany(sizeof (struct cpl_act_open_req), 16), howmany(sizeof (struct cpl_act_open_req6), 16) }, { howmany(sizeof (struct cpl_t5_act_open_req), 16), howmany(sizeof (struct cpl_t5_act_open_req6), 16) }, { howmany(sizeof (struct cpl_t6_act_open_req), 16), howmany(sizeof (struct cpl_t6_act_open_req6), 16) }, { howmany(sizeof (struct cpl_t7_act_open_req), 16), howmany(sizeof (struct cpl_t7_act_open_req6), 16) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 3); return (sz_table[idx][!!isipv6]); } static int set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, struct l2t_entry *l2te, struct smt_entry *smt) { void *wr; struct wrq_cookie cookie; struct filter_entry *f; int rc, atid = -1; uint32_t hash; MPASS(t->fs.hash); /* Already validated against fconf, iconf */ MPASS((t->fs.val.pfvf_vld & t->fs.val.ovlan_vld) == 0); MPASS((t->fs.mask.pfvf_vld & t->fs.mask.ovlan_vld) == 0); hash = hf_hashfn_4t(&t->fs); mtx_lock(&sc->tids.hftid_lock); if (lookup_hf(sc, &t->fs, hash) != NULL) { rc = EEXIST; goto done; } f = malloc(sizeof(*f), M_CXGBE, M_ZERO | M_NOWAIT); if (__predict_false(f == NULL)) { rc = ENOMEM; goto done; } f->fs = t->fs; f->l2te = l2te; f->smt = smt; atid = alloc_atid(sc, f); if (__predict_false(atid) == -1) { free(f, M_CXGBE); rc = EAGAIN; goto done; } MPASS(atid >= 0); wr = start_wrq_wr(&sc->sge.ctrlq[0], act_open_cpl_len16(sc, f->fs.type), &cookie); if (wr == NULL) { free_atid(sc, atid); free(f, M_CXGBE); rc = ENOMEM; goto done; } if (f->fs.type) mk_act_open_req6(sc, f, atid, ftuple, wr); else 
mk_act_open_req(sc, f, atid, ftuple, wr); f->locked = 1; /* ithread mustn't free f if ioctl is still around. */ f->pending = 1; f->tid = -1; insert_hf(sc, f, hash); commit_wrq_wr(&sc->sge.ctrlq[0], wr, &cookie); for (;;) { MPASS(f->locked); if (f->pending == 0) { if (f->valid) { rc = 0; f->locked = 0; t->idx = f->tid; } else { rc = f->tid; free(f, M_CXGBE); } break; } if (cv_wait_sig(&sc->tids.hftid_cv, &sc->tids.hftid_lock) != 0) { f->locked = 0; rc = EINPROGRESS; break; } } done: mtx_unlock(&sc->tids.hftid_lock); return (rc); } /* ABORT_REQ sent as a ULP command looks like this */ #define LEN__ABORT_REQ_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_abort_req_core)) static void * mk_abort_req_ulp(struct ulp_txpkt *ulpmc, uint32_t tid) { struct ulptx_idata *ulpsc; struct cpl_abort_req_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__ABORT_REQ_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_abort_req_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); req->rsvd0 = htonl(0); req->rsvd1 = 0; req->cmd = CPL_ABORT_NO_RST; ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__ABORT_REQ_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } /* ABORT_RPL sent as a ULP command looks like this */ #define LEN__ABORT_RPL_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_abort_rpl_core)) static void * mk_abort_rpl_ulp(struct ulp_txpkt *ulpmc, uint32_t tid) { struct ulptx_idata *ulpsc; struct cpl_abort_rpl_core *rpl; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__ABORT_RPL_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*rpl)); rpl = (struct cpl_abort_rpl_core *)(ulpsc + 1); OPCODE_TID(rpl) = htobe32(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); rpl->rsvd0 = htonl(0); rpl->rsvd1 = 0; rpl->cmd = CPL_ABORT_NO_RST; ulpsc = (struct ulptx_idata *)(rpl + 1); if (LEN__ABORT_RPL_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static inline int del_hashfilter_wrlen(void) { return (sizeof(struct work_request_hdr) + roundup2(LEN__SET_TCB_FIELD_ULP, 16) + roundup2(LEN__ABORT_REQ_ULP, 16) + roundup2(LEN__ABORT_RPL_ULP, 16)); } static void mk_del_hashfilter_wr(struct adapter *sc, int tid, struct work_request_hdr *wrh, int wrlen, int qid) { struct ulp_txpkt *ulpmc; INIT_ULPTX_WRH(wrh, wrlen, 0, 0); ulpmc = (struct ulp_txpkt *)(wrh + 1); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, tid, W_TCB_RSS_INFO, V_TCB_RSS_INFO(M_TCB_RSS_INFO), V_TCB_RSS_INFO(qid)); ulpmc = mk_abort_req_ulp(ulpmc, tid); ulpmc = mk_abort_rpl_ulp(ulpmc, tid); } static int del_hashfilter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; void *wr; struct filter_entry *f; struct wrq_cookie cookie; int rc; const int wrlen = del_hashfilter_wrlen(); const int inv_tid = ti->ntids + ti->tid_base; MPASS(sc->tids.hftid_hash_4t != NULL); MPASS(sc->tids.ntids > 0); if (t->idx < sc->tids.tid_base || t->idx >= inv_tid) return (EINVAL); mtx_lock(&ti->hftid_lock); f = lookup_hftid(sc, t->idx); if (f == NULL || f->valid == 0) { rc = EINVAL; goto done; } MPASS(f->tid == t->idx); 
if (f->locked) { rc = EPERM; goto done; } if (f->pending) { rc = EBUSY; goto done; } wr = start_wrq_wr(&sc->sge.ctrlq[0], howmany(wrlen, 16), &cookie); if (wr == NULL) { rc = ENOMEM; goto done; } mk_del_hashfilter_wr(sc, t->idx, wr, wrlen, sc->sge.fwq.abs_id); f->locked = 1; f->pending = 1; commit_wrq_wr(&sc->sge.ctrlq[0], wr, &cookie); t->fs = f->fs; /* extra info for the caller */ for (;;) { MPASS(f->locked); if (f->pending == 0) { if (f->valid) { f->locked = 0; rc = EIO; } else { rc = 0; free(f, M_CXGBE); } break; } if (cv_wait_sig(&ti->hftid_cv, &ti->hftid_lock) != 0) { f->locked = 0; rc = EINPROGRESS; break; } } done: mtx_unlock(&ti->hftid_lock); return (rc); } #define WORD_MASK 0xffffffff static void set_nat_params(struct adapter *sc, struct filter_entry *f, const bool dip, const bool sip, const bool dp, const bool sp) { if (dip) { if (f->fs.type) { set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW, WORD_MASK, f->fs.nat_dip[15] | f->fs.nat_dip[14] << 8 | f->fs.nat_dip[13] << 16 | f->fs.nat_dip[12] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW + 1, WORD_MASK, f->fs.nat_dip[11] | f->fs.nat_dip[10] << 8 | f->fs.nat_dip[9] << 16 | f->fs.nat_dip[8] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW + 2, WORD_MASK, f->fs.nat_dip[7] | f->fs.nat_dip[6] << 8 | f->fs.nat_dip[5] << 16 | f->fs.nat_dip[4] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW + 3, WORD_MASK, f->fs.nat_dip[3] | f->fs.nat_dip[2] << 8 | f->fs.nat_dip[1] << 16 | f->fs.nat_dip[0] << 24, 1); } else { set_tcb_field(sc, f->tid, W_TCB_RX_FRAG3_LEN_RAW, WORD_MASK, f->fs.nat_dip[3] | f->fs.nat_dip[2] << 8 | f->fs.nat_dip[1] << 16 | f->fs.nat_dip[0] << 24, 1); } } if (sip) { if (f->fs.type) { set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW, WORD_MASK, f->fs.nat_sip[15] | f->fs.nat_sip[14] << 8 | f->fs.nat_sip[13] << 16 | f->fs.nat_sip[12] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW + 1, WORD_MASK, f->fs.nat_sip[11] | f->fs.nat_sip[10] << 8 | f->fs.nat_sip[9] << 16 | f->fs.nat_sip[8] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW + 2, WORD_MASK, f->fs.nat_sip[7] | f->fs.nat_sip[6] << 8 | f->fs.nat_sip[5] << 16 | f->fs.nat_sip[4] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW + 3, WORD_MASK, f->fs.nat_sip[3] | f->fs.nat_sip[2] << 8 | f->fs.nat_sip[1] << 16 | f->fs.nat_sip[0] << 24, 1); } else { set_tcb_field(sc, f->tid, W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW, WORD_MASK, f->fs.nat_sip[3] | f->fs.nat_sip[2] << 8 | f->fs.nat_sip[1] << 16 | f->fs.nat_sip[0] << 24, 1); } } set_tcb_field(sc, f->tid, W_TCB_PDU_HDR_LEN, WORD_MASK, (dp ? f->fs.nat_dport : 0) | (sp ? f->fs.nat_sport << 16 : 0), 1); } /* * Returns EINPROGRESS to indicate that at least one TCB update was sent and the * last of the series of updates requested a reply. The reply informs the * driver that the filter is fully setup. 
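 * Only the final W_TCB_TIMESTAMP update requests a reply; all earlier
 * updates are sent with no_reply set.  That last update also zeroes the
 * timestamp/age fields used for hit counting when hitcnts is set, and
 * t4_hashfilter_tcb_rpl() marks the filter valid when the reply arrives.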
*/ static int configure_hashfilter_tcb(struct adapter *sc, struct filter_entry *f) { int updated = 0; MPASS(f->tid < sc->tids.ntids); MPASS(f->fs.hash); MPASS(f->pending); MPASS(f->valid == 0); if (f->fs.newdmac) { set_tcb_tflag(sc, f->tid, S_TF_CCTRL_ECE, 1, 1); updated++; } if (f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) { set_tcb_tflag(sc, f->tid, S_TF_CCTRL_RFR, 1, 1); updated++; } if (f->fs.newsmac) { MPASS(f->smt != NULL); set_tcb_tflag(sc, f->tid, S_TF_CCTRL_CWR, 1, 1); set_tcb_field(sc, f->tid, W_TCB_SMAC_SEL, V_TCB_SMAC_SEL(M_TCB_SMAC_SEL), V_TCB_SMAC_SEL(f->smt->idx), 1); updated++; } switch(f->fs.nat_mode) { case NAT_MODE_NONE: break; case NAT_MODE_DIP: set_nat_params(sc, f, true, false, false, false); updated++; break; case NAT_MODE_DIP_DP: set_nat_params(sc, f, true, false, true, false); updated++; break; case NAT_MODE_DIP_DP_SIP: set_nat_params(sc, f, true, true, true, false); updated++; break; case NAT_MODE_DIP_DP_SP: set_nat_params(sc, f, true, false, true, true); updated++; break; case NAT_MODE_SIP_SP: set_nat_params(sc, f, false, true, false, true); updated++; break; case NAT_MODE_DIP_SIP_SP: set_nat_params(sc, f, true, true, false, true); updated++; break; case NAT_MODE_ALL: set_nat_params(sc, f, true, true, true, true); updated++; break; default: MPASS(0); /* should have been validated earlier */ break; } if (f->fs.nat_seq_chk) { set_tcb_field(sc, f->tid, W_TCB_RCV_NXT, V_TCB_RCV_NXT(M_TCB_RCV_NXT), V_TCB_RCV_NXT(f->fs.nat_seq_chk), 1); updated++; } if (is_t5(sc) && f->fs.action == FILTER_DROP) { /* * Migrating = 1, Non-offload = 0 to get a T5 hashfilter to drop. */ set_tcb_field(sc, f->tid, W_TCB_T_FLAGS, V_TF_NON_OFFLOAD(1) | V_TF_MIGRATING(1), V_TF_MIGRATING(1), 1); updated++; } /* * Enable switching after all secondary resources (L2T entry, SMT entry, * etc.) are setup so that any switched packet will use correct * values. */ if (f->fs.action == FILTER_SWITCH) { set_tcb_tflag(sc, f->tid, S_TF_CCTRL_ECN, 1, 1); updated++; } if (f->fs.hitcnts || updated > 0) { set_tcb_field(sc, f->tid, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(M_TCB_TIMESTAMP) | V_TCB_T_RTT_TS_RECENT_AGE(M_TCB_T_RTT_TS_RECENT_AGE), V_TCB_TIMESTAMP(0ULL) | V_TCB_T_RTT_TS_RECENT_AGE(0ULL), 0); return (EINPROGRESS); } return (0); } diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 402aa29c05cb..84e31efa8b58 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -1,2561 +1,2570 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" static void t4_aiotx_cancel(struct kaiocb *job); static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); void send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) { struct wrqe *wr; struct fw_flowc_wr *flowc; unsigned int nparams, flowclen, paramidx; struct vi_info *vi = toep->vi; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; unsigned int pfvf = sc->pf << S_FW_VIID_PFN; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), ("%s: flowc for tid %u sent already", __func__, toep->tid)); if (tp != NULL) nparams = 8; else nparams = 6; if (toep->params.tc_idx != -1) { MPASS(toep->params.tc_idx >= 0 && toep->params.tc_idx < sc->params.nsched_cls); nparams++; } flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(toep->tid)); #define FLOWC_PARAM(__m, __v) \ do { \ flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ flowc->mnemval[paramidx].val = htobe32(__v); \ paramidx++; \ } while (0) paramidx = 0; FLOWC_PARAM(PFNVFN, pfvf); /* Firmware expects hw port and will translate to channel itself. */ FLOWC_PARAM(CH, pi->hw_port); FLOWC_PARAM(PORT, pi->hw_port); FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); FLOWC_PARAM(SNDBUF, toep->params.sndbuf); if (tp) { FLOWC_PARAM(MSS, toep->params.emss); FLOWC_PARAM(SNDNXT, tp->snd_nxt); FLOWC_PARAM(RCVNXT, tp->rcv_nxt); } else FLOWC_PARAM(MSS, 512); CTR6(KTR_CXGBE, "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", __func__, toep->tid, toep->params.emss, toep->params.sndbuf, tp ? tp->snd_nxt : 0, tp ? 
tp->rcv_nxt : 0); if (toep->params.tc_idx != -1) FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); #undef FLOWC_PARAM KASSERT(paramidx == nparams, ("nparams mismatch")); KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS, ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16))); txsd->tx_credits = howmany(flowclen, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; toep->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } #ifdef RATELIMIT /* * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. */ static int update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) { int tc_idx, rc; const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; const int port_id = toep->vi->pi->port_id; CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); if (kbps == 0) { /* unbind */ tc_idx = -1; } else { rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); if (rc != 0) return (rc); MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls); } if (toep->params.tc_idx != tc_idx) { struct wrqe *wr; struct fw_flowc_wr *flowc; int nparams = 1, flowclen, flowclen16; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || (wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq)) == NULL) { if (tc_idx >= 0) t4_release_cl_rl(sc, port_id, tc_idx); return (ENOMEM); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; if (tc_idx == -1) flowc->mnemval[0].val = htobe32(0xff); else flowc->mnemval[0].val = htobe32(tc_idx); KASSERT(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS, ("%s: tx_credits %u too large", __func__, flowclen16)); txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); } if (toep->params.tc_idx >= 0) t4_release_cl_rl(sc, port_id, toep->params.tc_idx); toep->params.tc_idx = tc_idx; return (0); } #endif void send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) { struct wrqe *wr; struct cpl_abort_req *req; int tid = toep->tid; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ INP_WLOCK_ASSERT(inp); CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", __func__, toep->tid, inp->inp_flags & INP_DROPPED ? "inp dropped" : tcpstates[tp->t_state], toep->flags, inp->inp_flags, toep->flags & TPF_ABORT_SHUTDOWN ? 
" (abort already in progress)" : ""); if (toep->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ toep->flags |= TPF_ABORT_SHUTDOWN; KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %d.", __func__, tid)); wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); if (inp->inp_flags & INP_DROPPED) req->rsvd0 = htobe32(snd_nxt); else req->rsvd0 = htobe32(tp->snd_nxt); req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); req->cmd = CPL_ABORT_SEND_RST; /* * XXX: What's the correct way to tell that the inp hasn't been detached * from its socket? Should I even be flushing the snd buffer here? */ if ((inp->inp_flags & INP_DROPPED) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) /* because I'm not sure. See comment above */ sbflush(&so->so_snd); } t4_l2t_send(sc, wr, toep->l2te); } /* * Called when a connection is established to translate the TCP options * reported by HW to FreeBSD's native format. */ static void assign_rxopt(struct tcpcb *tp, uint16_t opt) { struct toepcb *toep = tp->t_toe; struct inpcb *inp = tptoinpcb(tp); struct adapter *sc = td_adapter(toep->td); INP_LOCK_ASSERT(inp); toep->params.mtu_idx = G_TCPOPT_MSS(opt); tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (G_TCPOPT_TSTAMP(opt)) { toep->params.tstamp = 1; toep->params.emss -= TCPOLEN_TSTAMP_APPA; tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ tp->ts_recent = 0; /* hmmm */ tp->ts_recent_age = tcp_ts_getticks(); } else toep->params.tstamp = 0; if (G_TCPOPT_SACK(opt)) { toep->params.sack = 1; tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ } else { toep->params.sack = 0; tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ } if (G_TCPOPT_WSCALE_OK(opt)) tp->t_flags |= TF_RCVD_SCALE; /* Doing window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); } else toep->params.wscale = 0; CTR6(KTR_CXGBE, "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", toep->tid, toep->params.mtu_idx, toep->params.emss, toep->params.tstamp, toep->params.sack, toep->params.wscale); } /* * Completes some final bits of initialization for just established connections * and changes their state to TCPS_ESTABLISHED. * * The ISNs are from the exchange of SYNs. 
*/ void make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) { struct inpcb *inp = toep->inp; struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); uint16_t tcpopt = be16toh(opt); INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED, ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", __func__, toep->tid, so, inp, tp, toep); tcp_state_change(tp, TCPS_ESTABLISHED); tp->t_starttime = ticks; TCPSTAT_INC(tcps_connects); tp->irs = irs; tcp_rcvseqinit(tp); tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; tp->snd_nxt = iss + 1; tp->snd_max = iss + 1; assign_rxopt(tp, tcpopt); send_flowc_wr(toep, tp); soisconnected(so); } int send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct wrqe *wr; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); wr = alloc_wrqe(sizeof(*req), toep->ctrlq); if (wr == NULL) return (0); req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); t4_wrq_tx(sc, wr); return (credits); } void t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct inpcb *inp = tptoinpcb(tp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; struct toepcb *toep = tp->t_toe; int rx_credits; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK_ASSERT(sb); rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; if (rx_credits > 0 && (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { rx_credits = send_rx_credits(sc, toep, rx_credits); tp->rcv_wnd += rx_credits; tp->rcv_adv += rx_credits; } } void t4_rcvd(struct toedev *tod, struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; SOCKBUF_LOCK(sb); t4_rcvd_locked(tod, tp); SOCKBUF_UNLOCK(sb); } /* * Close a connection by sending a CPL_CLOSE_CON_REQ message. */ int t4_close_conn(struct adapter *sc, struct toepcb *toep) { struct wrqe *wr; struct cpl_close_con_req *req; unsigned int tid = toep->tid; CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); if (toep->flags & TPF_FIN_SENT) return (0); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, tid)); wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | V_FW_WR_FLOWID(tid)); req->wr.wr_lo = cpu_to_be64(0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); req->rsvd = 0; toep->flags |= TPF_FIN_SENT; toep->flags &= ~TPF_SEND_FIN; t4_l2t_send(sc, wr, toep->l2te); return (0); } #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) #define MIN_ISO_TX_CREDITS (howmany(sizeof(struct cpl_tx_data_iso), 16)) #define MIN_TX_CREDITS(iso) \ (MIN_OFLD_TX_CREDITS + ((iso) ? 
MIN_ISO_TX_CREDITS : 0)) _Static_assert(MAX_OFLD_TX_CREDITS <= MAX_OFLD_TX_SDESC_CREDITS, "MAX_OFLD_TX_SDESC_CREDITS too small"); /* Maximum amount of immediate data we could stuff in a WR */ static inline int max_imm_payload(int tx_credits, int iso) { const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0; const int n = 1; /* Use no more than one desc for imm. data WR */ KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits < MIN_TX_CREDITS(iso)) return (0); if (tx_credits >= (n * EQ_ESIZE) / 16) return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) - iso_cpl_size); else return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) - iso_cpl_size); } /* Maximum number of SGL entries we could stuff in a WR */ static inline int max_dsgl_nsegs(int tx_credits, int iso) { int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso); KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits < MIN_TX_CREDITS(iso)) return (0); nseg += 2 * (sge_pair_credits * 16 / 24); if ((sge_pair_credits * 16) % 24 == 16) nseg++; return (nseg); } static inline void write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode, unsigned int immdlen, unsigned int plen, uint8_t credits, int shove, int ulp_submode) { struct fw_ofld_tx_data_wr *txwr = dst; txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) | V_FW_WR_IMMDLEN(immdlen)); txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | V_FW_WR_LEN16(credits)); txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); txwr->plen = htobe32(plen); if (toep->params.tx_align > 0) { if (plen < 2 * toep->params.emss) txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); else txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | (toep->params.nagle == 0 ? 0 : F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); } } /* * Generate a DSGL from a starting mbuf. The total number of segments and the * maximum segments in any one mbuf are provided. 
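max_dsgl_nsegs() above converts leftover tx credits into a segment budget: a credit is 16 bytes, the leading ulptx_sgl carries one segment, and each additional ulptx_sge_pair carries two segments in 24 bytes, with a 16-byte tail still fitting one more address/length. A standalone version of just that arithmetic, taking the credits left after the minimum WR overhead as input (the sample credit values are arbitrary):

#include <stdio.h>

/* Segments that fit in a DSGL given the 16-byte credits left over after
 * the fw_ofld_tx_data_wr header, mirroring max_dsgl_nsegs(). */
static int
dsgl_nsegs(int sge_pair_credits)
{
	int nseg = 1;			/* first segment lives in the ulptx_sgl */
	int bytes = sge_pair_credits * 16;

	nseg += 2 * (bytes / 24);	/* full ulptx_sge_pairs, 2 segments each */
	if (bytes % 24 == 16)
		nseg++;			/* 16-byte remainder fits one more entry */
	return (nseg);
}

int
main(void)
{
	int credits;

	for (credits = 0; credits <= 6; credits++)
		printf("%d spare credits -> %d segments\n", credits,
		    dsgl_nsegs(credits));
	return (0);
}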
*/ static void write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) { struct mbuf *m; struct ulptx_sgl *usgl = dst; int i, j, rc; struct sglist sg; struct sglist_seg segs[n]; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); sglist_init(&sg, n, segs); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); i = -1; for (m = start; m != stop; m = m->m_next) { if (m->m_flags & M_EXTPG) rc = sglist_append_mbuf_epg(&sg, m, mtod(m, vm_offset_t), m->m_len); else rc = sglist_append(&sg, mtod(m, void *), m->m_len); if (__predict_false(rc != 0)) panic("%s: sglist_append %d", __func__, rc); for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } sglist_reset(&sg); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", __func__, nsegs, start, stop)); } bool t4_push_raw_wr(struct adapter *sc, struct toepcb *toep, struct mbuf *m) { #ifdef INVARIANTS struct inpcb *inp = toep->inp; #endif struct wrqe *wr; struct ofld_tx_sdesc *txsd; u_int credits, plen; INP_WLOCK_ASSERT(inp); MPASS(mbuf_raw_wr(m)); plen = m->m_pkthdr.len; credits = howmany(plen, 16); if (credits > toep->tx_credits) return (false); wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); if (wr == NULL) return (false); m_copydata(m, 0, plen, wrtod(wr)); m_freem(m); toep->tx_credits -= credits; if (toep->tx_credits < MIN_OFLD_TX_CREDITS) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); KASSERT(credits <= MAX_OFLD_TX_SDESC_CREDITS, ("%s: tx_credits %u too large", __func__, credits)); txsd = &toep->txsd[toep->txsd_pidx]; txsd->plen = 0; txsd->tx_credits = credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); return (true); } /* * Max number of SGL entries an offload tx work request can have. This is 41 * (1 + 40) for a full 512B work request. * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) */ #define OFLD_SGL_LEN (41) /* * Send data and/or a FIN to the peer. * * The socket's so_snd buffer consists of a stream of data starting with sb_mb * and linked together with m_next. sb_sndptr, if set, is the last mbuf that * was transmitted. * * drop indicates the number of bytes that should be dropped from the head of * the send buffer. It is an optimization that lets do_fw4_ack avoid creating * contention on the send buffer lock (before this change it used to do * sowwakeup and then t4_push_frames right after that when recovering from tx * stalls). When drop is set this function MUST drop the bytes and wake up any * writers. 
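write_tx_sgl() above puts the first segment in the ulptx_sgl header itself and every later segment in alternating slots of the ulptx_sge_pair array, zeroing the unused second slot of the last pair when the segment count is even. A sketch of that index mapping only (no DMA addresses, illustrative values):

#include <stdio.h>

int
main(void)
{
	int nsegs = 5;		/* arbitrary segment count */
	int k;

	for (k = 0; k < nsegs; k++) {
		if (k == 0)
			printf("seg %d -> usgl->len0/addr0\n", k);
		else
			printf("seg %d -> usgl->sge[%d].len[%d]/addr[%d]\n",
			    k, (k - 1) / 2, (k - 1) & 1, (k - 1) & 1);
	}
	if (nsegs % 2 == 0)
		printf("pad: usgl->sge[%d].len[1] = 0\n", (nsegs - 2) / 2);
	return (0);
}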
*/ static void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) { struct mbuf *sndptr, *m, *sb_sndptr; struct fw_ofld_tx_data_wr *txwr; struct wrqe *wr; u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; struct mbufq *pduq = &toep->ulp_pduq; int tx_credits, shove, compl, sowwakeup; struct ofld_tx_sdesc *txsd; bool nomap_mbuf_seen; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS || ulp_mode(toep) == ULP_MODE_RDMA, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags, drop); #endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; do { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); max_imm = max_imm_payload(tx_credits, 0); max_nsegs = max_dsgl_nsegs(tx_credits, 0); if (__predict_false((sndptr = mbufq_first(pduq)) != NULL)) { if (!t4_push_raw_wr(sc, toep, sndptr)) { toep->flags |= TPF_TX_SUSPENDED; return; } m = mbufq_dequeue(pduq); MPASS(m == sndptr); txsd = &toep->txsd[toep->txsd_pidx]; continue; } SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); drop = 0; } sb_sndptr = sb->sb_sndptr; sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; plen = 0; nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ nomap_mbuf_seen = false; for (m = sndptr; m != NULL; m = m->m_next) { int n; if ((m->m_flags & M_NOTREADY) != 0) break; if (plen + m->m_len > MAX_OFLD_TX_SDESC_PLEN) break; if (m->m_flags & M_EXTPG) { #ifdef KERN_TLS if (m->m_epg_tls != NULL) { toep->flags |= TPF_KTLS; if (plen == 0) { SOCKBUF_UNLOCK(sb); t4_push_ktls(sc, toep, 0); return; } break; } #endif n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), m->m_len); } else n = sglist_count(mtod(m, void *), m->m_len); nsegs += n; plen += m->m_len; /* This mbuf sent us _over_ the nsegs limit, back out */ if (plen > max_imm && nsegs > max_nsegs) { nsegs -= n; plen -= m->m_len; if (plen == 0) { /* Too few credits */ toep->flags |= TPF_TX_SUSPENDED; if (sowwakeup) { if (!TAILQ_EMPTY( &toep->aiotx_jobq)) t4_aiotx_queue_toep(so, toep); sowwakeup_locked(so); } else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); return; } break; } if (m->m_flags & M_EXTPG) nomap_mbuf_seen = true; if (max_nsegs_1mbuf < n) max_nsegs_1mbuf = n; sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ /* This mbuf put us right at the max_nsegs limit */ if (plen > max_imm && nsegs == max_nsegs) { m = m->m_next; break; } } if (sbused(sb) > sb->sb_hiwat * 5 / 8 && toep->plen_nocompl + plen >= sb->sb_hiwat / 4) compl = 1; else compl = 0; if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(so, SO_SND, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) { if (!TAILQ_EMPTY(&toep->aiotx_jobq)) t4_aiotx_queue_toep(so, toep); sowwakeup_locked(so); } else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); /* nothing to send */ if (plen == 0) { KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0, ("%s: nothing to send, but m != NULL is ready", __func__)); break; } if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); if (plen <= max_imm && !nomap_mbuf_seen) { /* Immediate data tx */ wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } txwr = wrtod(wr); credits = howmany(wr->wr_len, 16); write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen, credits, shove, 0); m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); nsegs = 0; } else { int wr_len; /* DSGL tx */ wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? 
*/ toep->flags |= TPF_TX_SUSPENDED; return; } txwr = wrtod(wr); credits = howmany(wr_len, 16); write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen, credits, shove, 0); write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf); if (wr_len & 0xf) { uint64_t *pad = (uint64_t *) ((uintptr_t)txwr + wr_len); *pad = 0; } } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; toep->tx_nocompl += credits; toep->plen_nocompl += plen; if (toep->tx_credits <= toep->tx_total * 3 / 8 && toep->tx_nocompl >= toep->tx_total / 4) compl = 1; if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); toep->tx_nocompl = 0; toep->plen_nocompl = 0; } tp->snd_nxt += plen; tp->snd_max += plen; SOCKBUF_LOCK(sb); KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); sb->sb_sndptr = sb_sndptr; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TX_CREDITS) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, ("%s: plen %u too large", __func__, plen)); txsd->plen = plen; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; t4_l2t_send(sc, wr, toep->l2te); } while (m != NULL && (m->m_flags & M_NOTREADY) == 0); /* Send a FIN if requested, but only if there's no more data to send */ if (m == NULL && toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); } static inline void rqdrop_locked(struct mbufq *q, int plen) { struct mbuf *m; while (plen > 0) { m = mbufq_dequeue(q); /* Too many credits. */ MPASS(m != NULL); M_ASSERTPKTHDR(m); /* Partial credits. */ MPASS(plen >= m->m_pkthdr.len); plen -= m->m_pkthdr.len; m_freem(m); } } /* * Not a bit in the TCB, but is a bit in the ulp_submode field of the * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR. */ #define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO) static void write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss, int len, int npdu) { struct cpl_tx_data_iso *cpl; unsigned int burst_size; unsigned int last; /* * The firmware will set the 'F' bit on the last PDU when * either condition is true: * * - this large PDU is marked as the "last" slice * * - the amount of data payload bytes equals the burst_size * * The strategy used here is to always set the burst_size * artificially high (len includes the size of the template * BHS) and only set the "last" flag if the original PDU had * 'F' set. 
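Tying the comment above to the fields write_tx_data_iso() actually programs: mpdu and burst_size go out in 4-byte units, burst_size is simply the (BHS-inclusive) length handed in, and the LAST bit mirrors the original PDU's F flag. A small sketch with made-up values (ISO_F here is only a stand-in bit for CXGBE_ISO_F):

#include <stdio.h>

#define DIV_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))
#define ISO_F			0x01	/* stand-in for CXGBE_ISO_F */

int
main(void)
{
	int len = 8240;		/* template BHS + payload handed to the TOE */
	int iso_mss = 4096;	/* per-slice payload limit */
	int flags = ISO_F;	/* the original PDU had F set */

	/* mpdu and burst_size are expressed in 4-byte units on the wire. */
	printf("mpdu %d, burst_size %d, last %d\n",
	    DIV_ROUND_UP(iso_mss, 4), DIV_ROUND_UP(len, 4), !!(flags & ISO_F));
	return (0);
}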
*/ burst_size = len; last = !!(flags & CXGBE_ISO_F); cpl = (struct cpl_tx_data_iso *)dst; cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) | V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) | V_CPL_TX_DATA_ISO_CPLHDRLEN(0) | V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) | V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) | V_CPL_TX_DATA_ISO_IMMEDIATE(0) | V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags))); cpl->ahs_len = 0; cpl->mpdu = htons(DIV_ROUND_UP(mss, 4)); cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4)); cpl->len = htonl(len); cpl->reserved2_seglen_offset = htonl(0); cpl->datasn_offset = htonl(0); cpl->buffer_offset = htonl(0); cpl->reserved3 = 0; } static struct wrqe * write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) { struct mbuf *m; struct fw_ofld_tx_data_wr *txwr; struct cpl_tx_data_iso *cpl_iso; void *p; struct wrqe *wr; u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; u_int adjusted_plen, imm_data, ulp_submode; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); int tx_credits, shove, npdu, wr_len; uint16_t iso_mss; static const u_int ulp_extra_len[] = {0, 4, 4, 8}; bool iso, nomap_mbuf_seen; M_ASSERTPKTHDR(sndptr); tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); if (mbuf_raw_wr(sndptr)) { plen = sndptr->m_pkthdr.len; KASSERT(plen <= SGE_MAX_WR_LEN, ("raw WR len %u is greater than max WR len", plen)); if (plen > tx_credits * 16) return (NULL); wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); if (__predict_false(wr == NULL)) return (NULL); m_copydata(sndptr, 0, plen, wrtod(wr)); return (wr); } iso = mbuf_iscsi_iso(sndptr); max_imm = max_imm_payload(tx_credits, iso); max_nsegs = max_dsgl_nsegs(tx_credits, iso); iso_mss = mbuf_iscsi_iso_mss(sndptr); plen = 0; nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ nomap_mbuf_seen = false; for (m = sndptr; m != NULL; m = m->m_next) { int n; if (m->m_flags & M_EXTPG) n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), m->m_len); else n = sglist_count(mtod(m, void *), m->m_len); nsegs += n; plen += m->m_len; /* * This mbuf would send us _over_ the nsegs limit. * Suspend tx because the PDU can't be sent out. */ if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs) return (NULL); if (m->m_flags & M_EXTPG) nomap_mbuf_seen = true; if (max_nsegs_1mbuf < n) max_nsegs_1mbuf = n; } if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); /* * We have a PDU to send. All of it goes out in one WR so 'm' * is NULL. A PDU's length is always a multiple of 4. */ MPASS(m == NULL); MPASS((plen & 3) == 0); MPASS(sndptr->m_pkthdr.len == plen); shove = !(tp->t_flags & TF_MORETOCOME); /* * plen doesn't include header and data digests, which are * generated and inserted in the right places by the TOE, but * they do occupy TCP sequence space and need to be accounted * for. */ ulp_submode = mbuf_ulp_submode(sndptr); MPASS(ulp_submode < nitems(ulp_extra_len)); npdu = iso ? 
howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1; adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu; if (iso) adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1); wr_len = sizeof(*txwr); if (iso) wr_len += sizeof(struct cpl_tx_data_iso); if (plen <= max_imm && !nomap_mbuf_seen) { /* Immediate data tx */ imm_data = plen; wr_len += plen; nsegs = 0; } else { /* DSGL tx */ imm_data = 0; wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; } wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? */ return (NULL); } txwr = wrtod(wr); credits = howmany(wr->wr_len, 16); if (iso) { write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR, imm_data + sizeof(struct cpl_tx_data_iso), adjusted_plen, credits, shove, ulp_submode | ULP_ISO); cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1); MPASS(plen == sndptr->m_pkthdr.len); write_tx_data_iso(cpl_iso, ulp_submode, mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu); p = cpl_iso + 1; } else { write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data, adjusted_plen, credits, shove, ulp_submode); p = txwr + 1; } if (imm_data != 0) { m_copydata(sndptr, 0, plen, p); } else { write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf); if (wr_len & 0xf) { uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); *pad = 0; } } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits: credits %u " "toep->tx_credits %u tx_credits %u nsegs %u " "max_nsegs %u iso %d", __func__, credits, toep->tx_credits, tx_credits, nsegs, max_nsegs, iso)); tp->snd_nxt += adjusted_plen; tp->snd_max += adjusted_plen; counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu); counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); if (iso) counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1); return (wr); } void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) { struct mbuf *sndptr, *m; struct fw_wr_hdr *wrhdr; struct wrqe *wr; u_int plen, credits; struct inpcb *inp = toep->inp; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; struct mbufq *pduq = &toep->ulp_pduq; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } if (drop) { struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int sbu; /* * An unlocked read is ok here as the data should only * transition from a non-zero value to either another * non-zero value or zero. Once it is zero it should * stay zero. */ if (__predict_false(sbused(sb)) > 0) { SOCKBUF_LOCK(sb); sbu = sbused(sb); if (sbu > 0) { /* * The data transmitted before the * tid's ULP mode changed to ISCSI is * still in so_snd. Incoming credits * should account for so_snd first. 
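For the ISO accounting in write_iscsi_mbuf_wr() above: the PDU count comes from slicing the payload (everything after the template BHS) by the ISO MSS, and the on-wire length grows by the per-PDU digest overhead plus one replicated BHS per extra slice. A quick arithmetic sketch (the 48-byte BHS matches the iSCSI basic header segment; the payload, MSS, and submode values are made up):

#include <stdio.h>

#define BHS_SIZE	48	/* ISCSI_BHS_SIZE */
#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	/* Digest overhead per PDU indexed by ulp_submode: none, HCRC, DCRC, both. */
	static const int ulp_extra_len[] = { 0, 4, 4, 8 };
	int plen = 8240;	/* hypothetical: one BHS + 8192 bytes of payload */
	int iso_mss = 4096;	/* hypothetical ISO slice size */
	int ulp_submode = 3;	/* header and data digests enabled */
	int npdu, adjusted_plen;

	npdu = howmany(plen - BHS_SIZE, iso_mss);
	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
	adjusted_plen += BHS_SIZE * (npdu - 1);	/* TOE replicates the BHS per slice */

	printf("npdu %d, wire length %d\n", npdu, adjusted_plen);	/* 2, 8304 */
	return (0);
}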
*/ sbdrop_locked(sb, min(sbu, drop)); drop -= min(sbu, drop); } sowwakeup_locked(so); /* unlocks so_snd */ } rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); } while ((sndptr = mbufq_first(pduq)) != NULL) { wr = write_iscsi_mbuf_wr(toep, sndptr); if (wr == NULL) { toep->flags |= TPF_TX_SUSPENDED; return; } plen = sndptr->m_pkthdr.len; credits = howmany(wr->wr_len, 16); KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); m = mbufq_dequeue(pduq); MPASS(m == sndptr); mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); toep->tx_credits -= credits; toep->tx_nocompl += credits; toep->plen_nocompl += plen; /* * Ensure there are enough credits for a full-sized WR * as page pod WRs can be full-sized. */ if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 && toep->tx_nocompl >= toep->tx_total / 4) { wrhdr = wrtod(wr); wrhdr->hi |= htobe32(F_FW_WR_COMPL); toep->tx_nocompl = 0; toep->plen_nocompl = 0; } toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TX_CREDITS) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, ("%s: plen %u too large", __func__, plen)); txsd->plen = plen; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; t4_l2t_send(sc, wr, toep->l2te); } /* Send a FIN if requested, but only if there are no more PDUs to send */ if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); } static inline void t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) { if (ulp_mode(toep) == ULP_MODE_ISCSI) t4_push_pdus(sc, toep, drop); else if (toep->flags & TPF_KTLS) t4_push_ktls(sc, toep, drop); else t4_push_frames(sc, toep, drop); } void t4_raw_wr_tx(struct adapter *sc, struct toepcb *toep, struct mbuf *m) { #ifdef INVARIANTS struct inpcb *inp = toep->inp; #endif INP_WLOCK_ASSERT(inp); /* * If there are other raw WRs enqueued, enqueue to preserve * FIFO ordering. */ if (!mbufq_empty(&toep->ulp_pduq)) { mbufq_enqueue(&toep->ulp_pduq, m); return; } /* * Cannot call t4_push_data here as that will lock so_snd and * some callers of this run in rx handlers with so_rcv locked. * Instead, just try to transmit this WR. 
*/ if (!t4_push_raw_wr(sc, toep, m)) { mbufq_enqueue(&toep->ulp_pduq, m); toep->flags |= TPF_TX_SUSPENDED; } } int t4_tod_output(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; #ifdef INVARIANTS struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp %p dropped.", __func__, inp)); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); t4_push_data(sc, toep, 0); return (0); } int t4_send_fin(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; #ifdef INVARIANTS struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp %p dropped.", __func__, inp)); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); toep->flags |= TPF_SEND_FIN; if (tp->t_state >= TCPS_ESTABLISHED) t4_push_data(sc, toep, 0); return (0); } int t4_send_rst(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; #if defined(INVARIANTS) struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp %p dropped.", __func__, inp)); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); /* hmmmm */ KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc for tid %u [%s] not sent already", __func__, toep->tid, tcpstates[tp->t_state])); send_reset(sc, toep, 0); return (0); } /* * Peer has sent us a FIN. */ static int do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_peer_close *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp = NULL; struct socket *so; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PEER_CLOSE, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); if (__predict_false(toep->flags & TPF_SYNQE)) { /* * do_pass_establish must have run before do_peer_close and if * this is still a synqe instead of a toepcb then the connection * must be getting aborted. */ MPASS(toep->flags & TPF_ABORT_SHUTDOWN); CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, toep, toep->flags); return (0); } KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = intotcpcb(inp); CTR6(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, toep->ddp.flags, inp); if (toep->flags & TPF_ABORT_SHUTDOWN) goto done; if (ulp_mode(toep) == ULP_MODE_TCPDDP) { DDP_LOCK(toep); if (__predict_false(toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) handle_ddp_close(toep, tp, cpl->rcv_nxt); DDP_UNLOCK(toep); } so = inp->inp_socket; socantrcvmore(so); if (ulp_mode(toep) == ULP_MODE_RDMA || (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { /* * There might be data received via DDP before the FIN * not reported to the driver. Just assume the * sequence number in the CPL is correct as the * sequence number of the FIN. 
*/ } else { KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt), ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, be32toh(cpl->rcv_nxt))); } tp->rcv_nxt = be32toh(cpl->rcv_nxt); switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; case TCPS_FIN_WAIT_2: restore_so_proto(so, inp->inp_vflag & INP_IPV6); t4_pcb_detach(NULL, tp); tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ NET_EPOCH_EXIT(et); CURVNET_RESTORE(); INP_WLOCK(inp); final_cpl_received(toep); return (0); default: log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", __func__, tid, tp->t_state); } done: INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * Peer has ACK'd our FIN. */ static int do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp = NULL; struct socket *so = NULL; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_CON_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = intotcpcb(inp); CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); if (toep->flags & TPF_ABORT_SHUTDOWN) goto done; so = inp->inp_socket; tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ switch (tp->t_state) { case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ restore_so_proto(so, inp->inp_vflag & INP_IPV6); t4_pcb_detach(NULL, tp); tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ NET_EPOCH_EXIT(et); CURVNET_RESTORE(); INP_WLOCK(inp); final_cpl_received(toep); /* no more CPLs expected */ return (0); case TCPS_LAST_ACK: if (tcp_close(tp)) INP_WUNLOCK(inp); goto release; case TCPS_FIN_WAIT_1: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) soisdisconnected(so); tcp_state_change(tp, TCPS_FIN_WAIT_2); break; default: log(LOG_ERR, "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", __func__, tid, tcpstates[tp->t_state]); } done: INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, int rst_status) { struct wrqe *wr; struct cpl_abort_rpl *cpl; wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } cpl = wrtod(wr); INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); cpl->cmd = rst_status; t4_wrq_tx(sc, wr); } static int abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) { switch (abort_reason) { case CPL_ERR_BAD_SYN: case CPL_ERR_CONN_RESET: return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: return (ETIMEDOUT); default: return (EIO); } } /* * TCP RST from the peer, timeout, or some other such critical error. 
*/ static int do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct sge_ofld_txq *ofld_txq = toep->ofld_txq; struct inpcb *inp; struct tcpcb *tp; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); if (toep->flags & TPF_SYNQE) return (do_abort_req_synqe(iq, rss, m)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); if (negative_advice(cpl->status)) { CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", __func__, cpl->status, tid, toep->flags); return (0); /* Ignore negative advice */ } inp = toep->inp; CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp->inp_flags, cpl->status); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. */ if (toep->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } toep->flags |= TPF_ABORT_SHUTDOWN; if ((inp->inp_flags & INP_DROPPED) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) so_error_set(so, abort_status_to_errno(tp, cpl->status)); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ } final_cpl_received(toep); done: NET_EPOCH_EXIT(et); CURVNET_RESTORE(); send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } /* * Reply to the CPL_ABORT_REQ (send_reset) */ static int do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); if (toep->flags & TPF_SYNQE) return (do_abort_rpl_synqe(iq, rss, m)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", __func__, tid, toep, inp, cpl->status); KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply", __func__)); INP_WLOCK(inp); final_cpl_received(toep); return (0); } static int do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct epoch_tracker et; int len; uint32_t ddp_placed = 0; if (__predict_false(toep->flags & TPF_SYNQE)) { /* * do_pass_establish must have run before do_rx_data and if this * is still a synqe instead of a toepcb then the connection must * be getting aborted. 
*/ MPASS(toep->flags & TPF_ABORT_SHUTDOWN); CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, toep, toep->flags); m_freem(m); return (0); } KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } tp = intotcpcb(inp); if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && toep->flags & TPF_TLS_RECEIVE)) { /* Received "raw" data on a TLS socket. */ CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", __func__, tid, len); do_rx_data_tls(cpl, toep, m); return (0); } if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; tp->rcv_nxt += len; if (tp->rcv_wnd < len) { KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, ("%s: negative window size", __func__)); } tp->rcv_wnd -= len; tp->t_rcvtime = ticks; if (ulp_mode(toep) == ULP_MODE_TCPDDP) DDP_LOCK(toep); so = inp_inpcbtosocket(inp); sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, len); m_freem(m); SOCKBUF_UNLOCK(sb); if (ulp_mode(toep) == ULP_MODE_TCPDDP) DDP_UNLOCK(toep); INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && len > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } if (ulp_mode(toep) == ULP_MODE_TCPDDP) { int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", __func__, tid, len); if (changed) { if (toep->ddp.flags & DDP_SC_REQ) toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; else if (cpl->ddp_off == 1) { /* Fell out of DDP mode */ toep->ddp.flags &= ~DDP_ON; CTR1(KTR_CXGBE, "%s: fell out of DDP mode", __func__); insert_ddp_data(toep, ddp_placed); } else { /* * Data was received while still * ULP_MODE_NONE, just fall through. */ } } if (toep->ddp.flags & DDP_ON) { /* * CPL_RX_DATA with DDP on can only be an indicate. * Start posting queued AIO requests via DDP. The * payload that arrived in this indicate is appended * to the socket buffer as usual. 
*/ handle_ddp_indicate(toep); } } sbappendstream_locked(sb, m, 0); t4_rcvd_locked(&toep->td->tod, tp); if (ulp_mode(toep) == ULP_MODE_TCPDDP && (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && sbavail(sb) != 0) { CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, tid); ddp_queue_toep(toep); } if (toep->flags & TPF_TLS_STARTING) tls_received_starting_data(sc, toep, sb, len); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); if (ulp_mode(toep) == ULP_MODE_TCPDDP) DDP_UNLOCK(toep); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } static int do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp; struct tcpcb *tp; struct socket *so; uint8_t credits = cpl->credits; struct ofld_tx_sdesc *txsd; int plen; #ifdef INVARIANTS unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); #endif /* * Very unusual case: we'd sent a flowc + abort_req for a synq entry and * now this comes back carrying the credits for the flowc. */ if (__predict_false(toep->flags & TPF_SYNQE)) { KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, ("%s: credits for a synq entry %p", __func__, toep)); return (0); } inp = toep->inp; KASSERT(opcode == CPL_FW4_ACK, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); INP_WLOCK(inp); if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { INP_WUNLOCK(inp); return (0); } KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); tp = intotcpcb(inp); if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { tcp_seq snd_una = be32toh(cpl->snd_una); #ifdef INVARIANTS if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { log(LOG_ERR, "%s: unexpected seq# %x for TID %u, snd_una %x\n", __func__, snd_una, toep->tid, tp->snd_una); } #endif if (tp->snd_una != snd_una) { tp->snd_una = snd_una; tp->ts_recent_age = tcp_ts_getticks(); } } #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); #endif so = inp->inp_socket; txsd = &toep->txsd[toep->txsd_cidx]; plen = 0; while (credits) { KASSERT(credits >= txsd->tx_credits, ("%s: too many (or partial) credits", __func__)); credits -= txsd->tx_credits; toep->tx_credits += txsd->tx_credits; plen += txsd->plen; txsd++; toep->txsd_avail++; KASSERT(toep->txsd_avail <= toep->txsd_total, ("%s: txsd avail > total", __func__)); if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { txsd = &toep->txsd[0]; toep->txsd_cidx = 0; } } if (toep->tx_credits == toep->tx_total) { toep->tx_nocompl = 0; toep->plen_nocompl = 0; } if (toep->flags & TPF_TX_SUSPENDED && toep->tx_credits >= toep->tx_total / 4) { #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, tid); #endif toep->flags &= ~TPF_TX_SUSPENDED; CURVNET_SET(toep->vnet); t4_push_data(sc, toep, plen); CURVNET_RESTORE(); } else if (plen > 0) { struct sockbuf *sb = &so->so_snd; int sbu; SOCKBUF_LOCK(sb); sbu = sbused(sb); if (ulp_mode(toep) == ULP_MODE_ISCSI) { if (__predict_false(sbu > 0)) { /* * The data transmitted before the * tid's ULP mode changed to ISCSI is * still in so_snd. Incoming credits * should account for so_snd first. 
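do_fw4_ack() above returns tx credits by walking the per-connection txsd ring from the consumer index, and the plen it accumulates is how many socket-buffer bytes the ACK covers. A toy version of that walk (descriptor contents and the 10-credit ACK are invented; the real code also asserts that an ACK never splits a descriptor):

#include <stdio.h>

struct sdesc {
	int tx_credits;
	int plen;
};

int
main(void)
{
	/* Hypothetical ring of outstanding work requests, oldest first. */
	struct sdesc txsd[4] = { { 3, 512 }, { 5, 1460 }, { 2, 0 }, { 9, 4096 } };
	int cidx = 0, total = 4;
	int credits = 10;	/* credits returned by one CPL_FW4_ACK */
	int plen = 0;

	while (credits > 0) {
		credits -= txsd[cidx].tx_credits;
		plen += txsd[cidx].plen;	/* bytes that may leave so_snd */
		if (++cidx == total)
			cidx = 0;		/* ring wraps like txsd_cidx */
	}
	printf("advanced cidx to %d, %d payload bytes acked\n", cidx, plen);
	return (0);
}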
*/ sbdrop_locked(sb, min(sbu, plen)); plen -= min(sbu, plen); } sowwakeup_locked(so); /* unlocks so_snd */ rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); } else { #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, tid, plen); #endif sbdrop_locked(sb, plen); if (!TAILQ_EMPTY(&toep->aiotx_jobq)) t4_aiotx_queue_toep(so, toep); sowwakeup_locked(so); /* unlocks so_snd */ } SOCKBUF_UNLOCK_ASSERT(sb); } INP_WUNLOCK(inp); return (0); } void write_set_tcb_field(struct adapter *sc, void *dst, struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) { struct cpl_set_tcb_field *req = dst; MPASS((cookie & ~M_COOKIE) == 0); if (reply) { MPASS(cookie != CPL_COOKIE_RESERVED); } INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); - req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); - if (reply == 0) - req->reply_ctrl |= htobe16(F_NO_REPLY); + if (reply == 0) { + req->reply_ctrl = htobe16(F_NO_REPLY); + } else { + const int qid = toep->ofld_rxq->iq.abs_id; + if (chip_id(sc) >= CHELSIO_T7) { + req->reply_ctrl = htobe16(V_T7_QUEUENO(qid) | + V_T7_REPLY_CHAN(0) | V_NO_REPLY(0)); + } else { + req->reply_ctrl = htobe16(V_QUEUENO(qid) | + V_REPLY_CHAN(0) | V_NO_REPLY(0)); + } + } req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); req->mask = htobe64(mask); req->val = htobe64(val); } void t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) { struct wrqe *wr; struct ofld_tx_sdesc *txsd; const u_int len = sizeof(struct cpl_set_tcb_field); wr = alloc_wrqe(len, wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } write_set_tcb_field(sc, wrtod(wr), toep, word, mask, val, reply, cookie); if (wrq->eq.type == EQ_OFLD) { txsd = &toep->txsd[toep->txsd_pidx]; _Static_assert(howmany(len, 16) <= MAX_OFLD_TX_SDESC_CREDITS, "MAX_OFLD_TX_SDESC_CREDITS too small"); txsd->tx_credits = howmany(len, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; } t4_wrq_tx(sc, wr); } void t4_init_cpl_io_handlers(void) { t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); } void t4_uninit_cpl_io_handlers(void) { t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_RX_DATA, NULL); t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); } /* * Use the 'backend1' field in AIO jobs to hold an error that should * be reported when the job is completed, the 'backend3' field to * store the amount of data sent by the AIO job so far, and the * 'backend4' field to hold a reference count on the job. * * Each unmapped mbuf holds a reference on the job as does the queue * so long as the job is queued. 
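The reference-counting convention described above can be pictured with a toy counter: the jobq owns one reference, every unmapped mbuf carved from the job takes another, and the job completes only when the last holder lets go. This is only a model of the convention, not driver code:

#include <stdio.h>

static int aio_refs;

static void
release(const char *who)
{
	if (--aio_refs == 0)
		printf("%s dropped the last ref: aio_complete()\n", who);
	else
		printf("%s released, %d refs remain\n", who, aio_refs);
}

int
main(void)
{
	int nmbufs = 3, i;

	aio_refs = 1;			/* reference owned by aiotx_jobq */
	for (i = 0; i < nmbufs; i++)
		aio_refs++;		/* one per unmapped mbuf (freed via aiotx_free_pgs) */

	release("queue");		/* job handed to the socket buffer */
	for (i = 0; i < nmbufs; i++)
		release("mbuf");	/* hardware finished with each mbuf */
	return (0);
}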
*/ #define aio_error backend1 #define aio_sent backend3 #define aio_refs backend4 #ifdef VERBOSE_TRACES static int jobtotid(struct kaiocb *job) { struct socket *so; struct tcpcb *tp; struct toepcb *toep; so = job->fd_file->f_data; tp = sototcpcb(so); toep = tp->t_toe; return (toep->tid); } #endif static void aiotx_free_job(struct kaiocb *job) { long status; int error; if (refcount_release(&job->aio_refs) == 0) return; error = (intptr_t)job->aio_error; status = job->aio_sent; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, jobtotid(job), job, status, error); #endif if (error != 0 && status != 0) error = 0; if (error == ECANCELED) aio_cancel(job); else if (error) aio_complete(job, -1, error); else { job->msgsnd = 1; aio_complete(job, status, 0); } } static void aiotx_free_pgs(struct mbuf *m) { struct kaiocb *job; vm_page_t pg; M_ASSERTEXTPG(m); job = m->m_ext.ext_arg1; #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, m->m_len, jobtotid(job)); #endif for (int i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); vm_page_unwire(pg, PQ_ACTIVE); } aiotx_free_job(job); } /* * Allocate a chain of unmapped mbufs describing the next 'len' bytes * of an AIO job. */ static struct mbuf * alloc_aiotx_mbuf(struct kaiocb *job, int len) { struct vmspace *vm; vm_page_t pgs[MBUF_PEXT_MAX_PGS]; struct mbuf *m, *top, *last; vm_map_t map; vm_offset_t start; int i, mlen, npages, pgoff; KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, ("%s(%p, %d): request to send beyond end of buffer", __func__, job, len)); /* * The AIO subsystem will cancel and drain all requests before * permitting a process to exit or exec, so p_vmspace should * be stable here. */ vm = job->userproc->p_vmspace; map = &vm->vm_map; start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent; pgoff = start & PAGE_MASK; top = NULL; last = NULL; while (len > 0) { mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff); KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0, ("%s: next start (%#jx + %#x) is not page aligned", __func__, (uintmax_t)start, mlen)); npages = vm_fault_quick_hold_pages(map, start, mlen, VM_PROT_WRITE, pgs, nitems(pgs)); if (npages < 0) break; m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY); m->m_epg_1st_off = pgoff; m->m_epg_npgs = npages; if (npages == 1) { KASSERT(mlen + pgoff <= PAGE_SIZE, ("%s: single page is too large (off %d len %d)", __func__, pgoff, mlen)); m->m_epg_last_len = mlen; } else { m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) - (npages - 2) * PAGE_SIZE; } for (i = 0; i < npages; i++) m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]); m->m_len = mlen; m->m_ext.ext_size = npages * PAGE_SIZE; m->m_ext.ext_arg1 = job; refcount_acquire(&job->aio_refs); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d", __func__, jobtotid(job), m, job, npages); #endif if (top == NULL) top = m; else last->m_next = m; last = m; len -= mlen; start += mlen; pgoff = 0; } return (top); } static void t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) { struct sockbuf *sb; struct inpcb *inp; struct tcpcb *tp; struct mbuf *m; u_int sent; int error, len; bool moretocome, sendmore; sb = &so->so_snd; SOCKBUF_UNLOCK(sb); m = NULL; #ifdef MAC error = mac_socket_check_send(job->fd_file->f_cred, so); if (error != 0) goto out; #endif /* Inline sosend_generic(). 
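A quick check on the page-span arithmetic in alloc_aiotx_mbuf() above: the chunk starts pgoff bytes into its first page, fills whole pages in the middle, and m_epg_last_len is whatever remains on the final page. Standalone sketch (the 4 KB page size and the sample offsets are assumptions):

#include <stdio.h>

#define PGSIZE	4096	/* PAGE_SIZE on most targets */

int
main(void)
{
	int pgoff = 1000;	/* offset of the chunk within its first page */
	int mlen = 10000;	/* bytes described by this unmapped mbuf */
	int npages, last_len;

	/* Pages spanned by [pgoff, pgoff + mlen). */
	npages = (pgoff + mlen + PGSIZE - 1) / PGSIZE;
	if (npages == 1)
		last_len = mlen;
	else
		last_len = mlen - (PGSIZE - pgoff) - (npages - 2) * PGSIZE;

	/* 3096 + 4096 + 2808 = 10000, so this prints "npages 3, m_epg_last_len 2808". */
	printf("npages %d, m_epg_last_len %d\n", npages, last_len);
	return (0);
}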
*/ error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); MPASS(error == 0); sendanother: SOCKBUF_LOCK(sb); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); if ((so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); error = ENOTCONN; goto out; } if (sbspace(sb) < sb->sb_lowat) { MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); /* * Don't block if there is too little room in the socket * buffer. Instead, requeue the request. */ if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); goto out; } /* * Write as much data as the socket permits, but no more than a * a single sndbuf at a time. */ len = sbspace(sb); if (len > job->uaiocb.aio_nbytes - job->aio_sent) { len = job->uaiocb.aio_nbytes - job->aio_sent; moretocome = false; } else moretocome = true; if (len > toep->params.sndbuf) { len = toep->params.sndbuf; sendmore = true; } else sendmore = false; if (!TAILQ_EMPTY(&toep->aiotx_jobq)) moretocome = true; SOCKBUF_UNLOCK(sb); MPASS(len != 0); m = alloc_aiotx_mbuf(job, len); if (m == NULL) { SOCK_IO_SEND_UNLOCK(so); error = EFAULT; goto out; } /* Inlined tcp_usr_send(). */ inp = toep->inp; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); error = ECONNRESET; goto out; } sent = m_length(m, NULL); job->aio_sent += sent; counter_u64_add(toep->ofld_txq->tx_aio_octets, sent); sbappendstream(sb, m, 0); m = NULL; if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); if (moretocome) tp->t_flags |= TF_MORETOCOME; error = tcp_output(tp); if (error < 0) { INP_UNLOCK_ASSERT(inp); SOCK_IO_SEND_UNLOCK(so); error = -error; goto out; } if (moretocome) tp->t_flags &= ~TF_MORETOCOME; } INP_WUNLOCK(inp); if (sendmore) goto sendanother; SOCK_IO_SEND_UNLOCK(so); if (error) goto out; /* * If this is a blocking socket and the request has not been * fully completed, requeue it until the socket is ready * again. */ if (job->aio_sent < job->uaiocb.aio_nbytes && !(so->so_state & SS_NBIO)) { SOCKBUF_LOCK(sb); if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); return; } /* * If the request will not be requeued, drop the queue's * reference to the job. Any mbufs in flight should still * hold a reference, but this drops the reference that the * queue owns while it is waiting to queue mbufs to the * socket. 
*/ aiotx_free_job(job); counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1); out: if (error) { job->aio_error = (void *)(intptr_t)error; aiotx_free_job(job); } m_freem(m); SOCKBUF_LOCK(sb); } static void t4_aiotx_task(void *context, int pending) { struct toepcb *toep = context; struct socket *so; struct kaiocb *job; struct epoch_tracker et; so = toep->aiotx_so; CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); SOCKBUF_LOCK(&so->so_snd); while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { job = TAILQ_FIRST(&toep->aiotx_jobq); TAILQ_REMOVE(&toep->aiotx_jobq, job, list); if (!aio_clear_cancel_function(job)) continue; t4_aiotx_process_job(toep, so, job); } toep->aiotx_so = NULL; SOCKBUF_UNLOCK(&so->so_snd); NET_EPOCH_EXIT(et); free_toepcb(toep); sorele(so); CURVNET_RESTORE(); } static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep) { SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false"); #endif if (toep->aiotx_so != NULL) return; soref(so); toep->aiotx_so = so; hold_toepcb(toep); soaio_enqueue(&toep->aiotx_task); } static void t4_aiotx_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; struct tcpcb *tp; struct toepcb *toep; so = job->fd_file->f_data; tp = sototcpcb(so); toep = tp->t_toe; MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); sb = &so->so_snd; SOCKBUF_LOCK(sb); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); job->aio_error = (void *)(intptr_t)ECANCELED; aiotx_free_job(job); } int t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); /* This only handles writes. */ if (job->uaiocb.aio_lio_opcode != LIO_WRITE) return (EOPNOTSUPP); if (!sc->tt.tx_zcopy) return (EOPNOTSUPP); if (tls_tx_key(toep)) return (EOPNOTSUPP); SOCKBUF_LOCK(&so->so_snd); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); #endif if (!aio_set_cancel_function(job, t4_aiotx_cancel)) panic("new job was cancelled"); refcount_init(&job->aio_refs, 1); TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); if (sowriteable(so)) t4_aiotx_queue_toep(so, toep); SOCKBUF_UNLOCK(&so->so_snd); return (0); } void aiotx_init_toep(struct toepcb *toep) { TAILQ_INIT(&toep->aiotx_jobq); TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index 82d8f724b382..18d73afc47c5 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1,2361 +1,2368 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw toe_protosw; static struct protosw toe6_protosw; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static int t4_tom_stop(struct adapter *); static int t4_tom_restart(struct adapter *); static struct uld_info tom_uld_info = { .uld_activate = t4_tom_activate, .uld_deactivate = t4_tom_deactivate, .uld_stop = t4_tom_stop, .uld_restart = t4_tom_restart, }; static void release_offload_resources(struct toepcb *); static void done_with_toepcb(struct toepcb *); static int alloc_tid_tabs(struct adapter *); static void free_tid_tabs(struct adapter *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); static void cleanup_stranded_tids(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. 
*/ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->incarnation = sc->incarnation; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. */ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->state != CS_HW_CONFIGURED) { CH_ERR(vi, "tid %d cannot be bound to traffic class %d " "because it is not configured (its state is %d)\n", toep->tid, cp->tc_idx, tc->state); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[cp->ctrlq_idx]; tls_init_toep(toep); MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); } void restore_so_proto(struct socket *so, bool v6) { if (v6) so->so_proto = &tcp6_protosw; else so->so_proto = &tcp_protosw; } /* This is _not_ the normal way to "unoffload" a socket. 
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; restore_so_proto(so, inp->inp_vflag & INP_IPV6); SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); if (toep->l2te) { t4_l2t_release(toep->l2te); toep->l2te = NULL; } if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); toep->tid = -1; mtx_lock(&td->toep_list_lock); if (toep->flags & TPF_IN_TOEP_LIST) { toep->flags &= ~TPF_IN_TOEP_LIST; TAILQ_REMOVE(&td->toep_list, toep, link); } mtx_unlock(&td->toep_list_lock); } if (toep->ce) { t4_release_clip_entry(sc, toep->ce); toep->ce = NULL; } if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); } /* * Both the driver and kernel are done with the toepcb. */ static void done_with_toepcb(struct toepcb *toep) { KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR(KTR_CXGBE, "%s: toep %p (0x%x)", __func__, toep, toep->flags); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_empty(&toep->ulp_pduq)); MPASS(mbufq_empty(&toep->ulp_pdu_reclaimq)); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); MPASS(toep->tid == -1); MPASS(toep->l2te == NULL); MPASS(toep->ce == NULL); MPASS((toep->flags & TPF_IN_TOEP_LIST) == 0); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) done_with_toepcb(toep); } /* * setsockopt handler. 
 */
static void
t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
{
	struct adapter *sc = tod->tod_softc;
	struct toepcb *toep = tp->t_toe;

	if (dir == SOPT_GET)
		return;

	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);

	switch (name) {
	case TCP_NODELAY:
		if (tp->t_state != TCPS_ESTABLISHED)
			break;
		toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
		t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
		    V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0);
		break;
	default:
		break;
	}
}

static inline uint64_t
get_tcb_tflags(const uint64_t *tcb)
{

	return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32));
}

static inline uint32_t
get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift)
{
#define LAST_WORD ((TCB_SIZE / 4) - 1)
	uint64_t t1, t2;
	int flit_idx;

	MPASS(mask != 0);
	MPASS(word <= LAST_WORD);
	MPASS(shift < 32);

	flit_idx = (LAST_WORD - word) / 2;
	if (word & 0x1)
		shift += 32;
	t1 = be64toh(tcb[flit_idx]) >> shift;
	t2 = 0;
	if (fls(mask) > 64 - shift) {
		/*
		 * Will spill over into the next logical flit, which is the
		 * flit before this one.  The flit_idx before this one must be
		 * valid.
		 */
		MPASS(flit_idx > 0);
		t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
	}

	return ((t2 | t1) & mask);
#undef LAST_WORD
}
#define GET_TCB_FIELD(tcb, F) \
    get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F)

/*
 * Issues a CPL_GET_TCB to read the entire TCB for the tid.
 */
static int
send_get_tcb(struct adapter *sc, u_int tid)
{
	struct cpl_get_tcb *cpl;
	struct wrq_cookie cookie;

	MPASS(tid >= sc->tids.tid_base);
	MPASS(tid - sc->tids.tid_base < sc->tids.ntids);

	cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16),
	    &cookie);
	if (__predict_false(cpl == NULL))
		return (ENOMEM);
	bzero(cpl, sizeof(*cpl));
	INIT_TP_WR(cpl, tid);
	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
-	cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
-	    V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
+	if (chip_id(sc) >= CHELSIO_T7) {
+		cpl->reply_ctrl =
+		    htobe16(V_T7_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id) |
+		    V_T7_REPLY_CHAN(0) | V_NO_REPLY(0));
+	} else {
+		cpl->reply_ctrl =
+		    htobe16(V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id) |
+		    V_REPLY_CHAN(0) | V_NO_REPLY(0));
+	}
	cpl->cookie = 0xff;
	commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);

	return (0);
}

static struct tcb_histent *
alloc_tcb_histent(struct adapter *sc, u_int tid, int flags)
{
	struct tcb_histent *te;

	MPASS(flags == M_NOWAIT || flags == M_WAITOK);

	te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags);
	if (te == NULL)
		return (NULL);
	mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF);
	callout_init_mtx(&te->te_callout, &te->te_lock, 0);
	te->te_adapter = sc;
	te->te_tid = tid;

	return (te);
}

static void
free_tcb_histent(struct tcb_histent *te)
{

	mtx_destroy(&te->te_lock);
	free(te, M_CXGBE);
}

/*
 * Start tracking the tid in the TCB history.
*/ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. */ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. 
*/ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); ti->tcpi_rcv_adv = GET_TCB_FIELD(tcb, RCV_ADV); ti->tcpi_dupacks = GET_TCB_FIELD(tcb, T_DUPACKS); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); ti->tcpi_snd_una = v - GET_TCB_FIELD(tcb, SND_UNA_RAW); ti->tcpi_snd_max = v - GET_TCB_FIELD(tcb, SND_MAX_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. 
*/ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. */ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. */ static void t4_tcp_info(struct toedev *tod, const struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_LOCK_ASSERT(tptoinpcb(tp)); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tptoinpcb(tp)); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif static void send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep) { struct wrq_cookie cookie; struct fw_flowc_wr *flowc; struct ofld_tx_sdesc *txsd; const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval); const int flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) { CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__, toep->tid, toep->tx_credits, toep->txsd_avail); return; } flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie); if (__predict_false(flowc == NULL)) { CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid); return; } flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(1)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[0].val = htobe32(toep->params.emss); txsd = &toep->txsd[toep->txsd_pidx]; _Static_assert(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS, "MAX_OFLD_TX_SDESC_CREDITS too small"); txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie); } static void t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu) { struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int idx, len; struct wrq_cookie cookie; struct inpcb *inp = tptoinpcb(tp); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); unsigned short *mtus = &sc->params.mtus[0]; INP_WLOCK_ASSERT(inp); MPASS(mtu > 0); /* kernel is supposed to provide something usable. */ /* tp->snd_una and snd_max are in host byte order too. 
*/ seq = be32toh(seq); CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)", __func__, toep->tid, seq, mtu, toep->params.mtu_idx, mtus[toep->params.mtu_idx]); if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */ (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) { CTR5(KTR_CXGBE, "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).", __func__, toep->tid, seq, tp->snd_una, tp->snd_max); return; } /* Find the best mtu_idx for the suggested MTU. */ for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++) continue; if (idx >= toep->params.mtu_idx) return; /* Never increase the PMTU (just like the kernel). */ /* * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first * one updates the mtu_idx and the second one triggers a retransmit. */ len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie); if (wrh == NULL) { CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n", toep->tid, toep->params.mtu_idx, idx); return; } INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_MAXSEG, V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx)); ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0); commit_wrq_wr(toep->ctrlq, wrh, &cookie); /* Update the software toepcb and tcpcb. */ toep->params.mtu_idx = idx; tp->t_maxseg = mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (tp->t_flags & TF_RCVD_TSTMP) toep->params.emss -= TCPOLEN_TSTAMP_APPA; /* Update the firmware flowc. */ send_mss_flowc_wr(sc, toep); /* Update the MTU in the kernel's hostcache. */ if (sc->tt.update_hc_on_pmtu_change != 0) { struct in_conninfo inc = {0}; inc.inc_fibnum = inp->inp_inc.inc_fibnum; if (inp->inp_inc.inc_flags & INC_ISIPV6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = inp->inp_inc.inc6_faddr; } else { inc.inc_faddr = inp->inp_inc.inc_faddr; } tcp_hc_updatemtu(&inc, mtu); } CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", __func__, toep->tid, toep->params.mtu_idx, mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss); } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. 
*/ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; bool need_wakeup; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0; toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL); mbufq_drain(&toep->ulp_pduq); mbufq_drain(&toep->ulp_pdu_reclaimq); release_offload_resources(toep); if (!(toep->flags & TPF_ATTACHED)) done_with_toepcb(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); if (need_wakeup) { struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); mtx_lock(lock); wakeup(toep); mtx_unlock(lock); } } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. 
*/ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); opt2 |= V_TX_QUEUE(TX_MODQ(pi->tx_chan)); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); if (chip_id(sc) <= CHELSIO_T6) { MPASS(pi->rx_chan == 0 || pi->rx_chan == 1); opt2 |= V_RX_CHANNEL(pi->rx_chan); } MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); MPASS(cp->ulp_mode != ULP_MODE_TCPDDP); return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->hw_port << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } return (ntuple); } /* * Initialize various connection parameters. 
*/ void init_conn_params(struct vi_info *vi , struct offload_settings *s, struct in_conninfo *inc, struct socket *so, const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tom_tunables *tt = &sc->tt; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); u_long wnd; u_int q_idx; MPASS(s->offload != 0); /* Congestion control algorithm */ if (s->cong_algo >= 0) cp->cong_algo = s->cong_algo & M_CONG_CNTRL; else if (sc->tt.cong_algorithm >= 0) cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL; else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) cp->cong_algo = CONG_ALG_RENO; else if (strcasecmp(cc->name, "tahoe") == 0) cp->cong_algo = CONG_ALG_TAHOE; if (strcasecmp(cc->name, "newreno") == 0) cp->cong_algo = CONG_ALG_NEWRENO; if (strcasecmp(cc->name, "highspeed") == 0) cp->cong_algo = CONG_ALG_HIGHSPEED; else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ cp->cong_algo = CONG_ALG_NEWRENO; } } /* Tx traffic scheduling class. */ if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls) cp->tc_idx = s->sched_class; else cp->tc_idx = -1; /* Nagle's algorithm. */ if (s->nagle >= 0) cp->nagle = s->nagle > 0 ? 1 : 0; else cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1; /* TCP Keepalive. */ if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE) cp->keepalive = 1; else cp->keepalive = 0; /* Optimization that's specific to T5 @ 40G. */ if (tt->tx_align >= 0) cp->tx_align = tt->tx_align > 0 ? 1 : 0; else if (chip_id(sc) == CHELSIO_T5 && (port_top_speed(pi) > 10 || sc->params.nports > 2)) cp->tx_align = 1; else cp->tx_align = 0; /* ULP mode. */ cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. */ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Control queue. */ cp->ctrlq_idx = vi->pi->port_id; /* Tx queue for this connection. */ if (s->txq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->txq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->txq_rr, 1); else q_idx = s->txq; cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq; /* Rx queue for this connection. */ if (s->rxq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->rxq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1); else q_idx = s->rxq; cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. 
*/ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. */ cp->emss = 0; } void update_tid_qid_sel(struct vi_info *vi, struct conn_params *cp, int tid) { struct adapter *sc = vi->adapter; const int mask = sc->params.tid_qid_sel_mask; struct sge_ofld_txq *ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; uint32_t ngroup; int g, nqpg; cp->ctrlq_idx = ofld_txq_group(tid, mask); CTR(KTR_CXGBE, "tid %u is on core %u", tid, cp->ctrlq_idx); if ((ofld_txq->wrq.eq.cntxt_id & mask) == (tid & mask)) return; ngroup = 1 << bitcount32(mask); MPASS(vi->nofldtxq % ngroup == 0); g = ofld_txq_group(tid, mask); nqpg = vi->nofldtxq / ngroup; cp->txq_idx = vi->first_ofld_txq + g * nqpg + arc4random() % nqpg; #ifdef INVARIANTS MPASS(cp->txq_idx < vi->first_ofld_txq + vi->nofldtxq); ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; MPASS((ofld_txq->wrq.eq.cntxt_id & mask) == (tid & mask)); #endif } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tab(struct adapter *sc) { struct tid_info *t = &sc->tids; KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } static void free_tid_tabs(struct adapter *sc) { free_tid_tab(sc); free_stid_tab(sc); } static int alloc_tid_tabs(struct adapter *sc) { int rc; rc = alloc_tid_tab(sc); if (rc != 0) goto failed; rc = alloc_stid_tab(sc); if (rc != 0) goto failed; return (0); failed: free_tid_tabs(sc); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct tom_data *td) { #ifdef INVARIANTS int i; if 
(td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tcb_history(sc, td); free_tid_tabs(sc); free(td, M_CXGBE); } static char * prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, int *buflen) { char *pkt; struct tcphdr *th; int ipv6, len; const int maxlen = max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + sizeof(struct tcphdr); MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); if (pkt == NULL) return (NULL); ipv6 = inp->inp_vflag & INP_IPV6; len = 0; if (EVL_VLANOFTAG(vtag) == 0xfff) { struct ether_header *eh = (void *)pkt; if (ipv6) eh->ether_type = htons(ETHERTYPE_IPV6); else eh->ether_type = htons(ETHERTYPE_IP); len += sizeof(*eh); } else { struct ether_vlan_header *evh = (void *)pkt; evh->evl_encap_proto = htons(ETHERTYPE_VLAN); evh->evl_tag = htons(vtag); if (ipv6) evh->evl_proto = htons(ETHERTYPE_IPV6); else evh->evl_proto = htons(ETHERTYPE_IP); len += sizeof(*evh); } if (ipv6) { struct ip6_hdr *ip6 = (void *)&pkt[len]; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = ip6->ip6_src; } len += sizeof(*ip6); } else { struct ip *ip = (void *)&pkt[len]; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); ip->ip_ttl = inp->inp_ip_ttl; ip->ip_p = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip->ip_src = inp->inp_laddr; ip->ip_dst = ip->ip_src; } len += sizeof(*ip); } th = (void *)&pkt[len]; if (open_type == OPEN_TYPE_ACTIVE) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = inp->inp_fport; /* ditto */ } else if (open_type == OPEN_TYPE_LISTEN) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = th->th_sport; } len += sizeof(th); *pktlen = *buflen = len; return (pkt); } const struct offload_settings * lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, uint16_t vtag, struct inpcb *inp) { const struct t4_offload_policy *op; char *pkt; struct offload_rule *r; int i, matched, pktlen, buflen; static const struct offload_settings allow_offloading_settings = { .offload = 1, .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1, .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1, .ddp = -1, .tls = -1, .txq = QUEUE_RANDOM, .rxq = QUEUE_RANDOM, .mss = -1, }; static 
const struct offload_settings disallow_offloading_settings = { .offload = 0, /* rest is irrelevant when offload is off. */ }; rw_assert(&sc->policy_lock, RA_LOCKED); /* * If there's no Connection Offloading Policy attached to the device * then we need to return a default static policy. If * "cop_managed_offloading" is true, then we need to disallow * offloading until a COP is attached to the device. Otherwise we * allow offloading ... */ op = sc->policy; if (op == NULL) { if (sc->tt.cop_managed_offloading) return (&disallow_offloading_settings); else return (&allow_offloading_settings); } switch (open_type) { case OPEN_TYPE_ACTIVE: case OPEN_TYPE_LISTEN: pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen); break; case OPEN_TYPE_PASSIVE: MPASS(m != NULL); pkt = mtod(m, char *); MPASS(*pkt == CPL_PASS_ACCEPT_REQ); pkt += sizeof(struct cpl_pass_accept_req); pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); buflen = m->m_len - sizeof(struct cpl_pass_accept_req); break; default: MPASS(0); return (&disallow_offloading_settings); } if (pkt == NULL || pktlen == 0 || buflen == 0) return (&disallow_offloading_settings); matched = 0; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { if (r->open_type != open_type && r->open_type != OPEN_TYPE_DONTCARE) { continue; } matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); if (matched) break; } if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) free(pkt, M_CXGBE); return (matched ? &r->settings : &disallow_offloading_settings); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid, tid; struct wrqe *wr; struct adapter *sc = td_adapter(td); mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, lookup_atid(sc, atid), EHOSTUNREACH); free(wr, M_CXGBE); break; case CPL_PASS_ACCEPT_RPL: tid = GET_TID(cpl); CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid); synack_failure_cleanup(sc, lookup_tid(sc, tid)); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } /* * Based on do_abort_req. We treat an abrupt hardware stop as a connection * abort from the hardware. 
*/ static void live_tid_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status) { struct inpcb *inp; struct tcpcb *tp; struct epoch_tracker et; MPASS(!(toep->flags & TPF_SYNQE)); inp = toep->inp; CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); toep->flags |= TPF_ABORT_SHUTDOWN; if ((inp->inp_flags & INP_DROPPED) == 0) { struct socket *so = inp->inp_socket; if (so != NULL) so_error_set(so, status); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ } final_cpl_received(toep); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } static void cleanup_stranded_tids(void *arg, int count) { TAILQ_HEAD(, toepcb) tlist = TAILQ_HEAD_INITIALIZER(tlist); TAILQ_HEAD(, synq_entry) slist = TAILQ_HEAD_INITIALIZER(slist); struct tom_data *td = arg; struct adapter *sc = td_adapter(td); struct toepcb *toep; struct synq_entry *synqe; /* Clean up synq entries. */ mtx_lock(&td->toep_list_lock); TAILQ_SWAP(&td->stranded_synqe, &slist, synq_entry, link); mtx_unlock(&td->toep_list_lock); while ((synqe = TAILQ_FIRST(&slist)) != NULL) { TAILQ_REMOVE(&slist, synqe, link); MPASS(synqe->tid >= 0); /* stale, was kept around for debug */ synqe->tid = -1; synack_failure_cleanup(sc, synqe); } /* Clean up in-flight active opens. */ mtx_lock(&td->toep_list_lock); TAILQ_SWAP(&td->stranded_atids, &tlist, toepcb, link); mtx_unlock(&td->toep_list_lock); while ((toep = TAILQ_FIRST(&tlist)) != NULL) { TAILQ_REMOVE(&tlist, toep, link); MPASS(toep->tid >= 0); /* stale, was kept around for debug */ toep->tid = -1; act_open_failure_cleanup(sc, toep, EHOSTUNREACH); } /* Clean up live connections. */ mtx_lock(&td->toep_list_lock); TAILQ_SWAP(&td->stranded_tids, &tlist, toepcb, link); mtx_unlock(&td->toep_list_lock); while ((toep = TAILQ_FIRST(&tlist)) != NULL) { TAILQ_REMOVE(&tlist, toep, link); MPASS(toep->tid >= 0); /* stale, was kept around for debug */ toep->tid = -1; live_tid_failure_cleanup(sc, toep, ECONNABORTED); } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; int i, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); TAILQ_INIT(&td->synqe_list); TAILQ_INIT(&td->stranded_atids); TAILQ_INIT(&td->stranded_tids); TASK_INIT(&td->cleanup_stranded_tids, 0, cleanup_stranded_tids, td); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(sc); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); alloc_tcb_history(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; 
tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; tod->tod_tcp_info = t4_tcp_info; #ifdef KERN_TLS tod->tod_alloc_tls_session = t4_alloc_tls_session; #endif tod->tod_pmtu_update = t4_pmtu_update; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { SETTOEDEV(vi->ifp, &td->tod); } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0, i, v; struct tom_data *td = sc->tom_softc; struct vi_info *vi; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ if (sc->offload_map != 0) { for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { toe_capability(vi, false); if_setcapenablebit(vi->ifp, 0, IFCAP_TOE); SETTOEDEV(vi->ifp, NULL); } } MPASS(sc->offload_map == 0); } mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; MPASS(TAILQ_EMPTY(&td->synqe_list)); MPASS(TAILQ_EMPTY(&td->stranded_tids)); mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); taskqueue_drain(taskqueue_thread, &td->cleanup_stranded_tids); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static void stop_atids(struct adapter *sc) { struct tom_data *td = sc->tom_softc; struct tid_info *t = &sc->tids; struct toepcb *toep; int atid; /* * Hashfilters and T6-KTLS are the only other users of atids but they're * both mutually exclusive with TOE. That means t4_tom owns all the * atids in the table. */ MPASS(!is_hashfilter(sc)); if (is_t6(sc)) MPASS(!(sc->flags & KERN_TLS_ON)); /* New atids are not being allocated. */ #ifdef INVARIANTS mtx_lock(&t->atid_lock); MPASS(t->atid_alloc_stopped == true); mtx_unlock(&t->atid_lock); #endif /* * In-use atids fall in one of these two categories: * a) Those waiting for L2 resolution before being submitted to * hardware. * b) Those that have been submitted to hardware and are awaiting * replies that will never arrive because the LLD is stopped. */ for (atid = 0; atid < t->natids; atid++) { toep = lookup_atid(sc, atid); if ((uintptr_t)toep >= (uintptr_t)&t->atid_tab[0] && (uintptr_t)toep < (uintptr_t)&t->atid_tab[t->natids]) continue; if (__predict_false(toep == NULL)) continue; MPASS(toep->tid == atid); MPASS(toep->incarnation == sc->incarnation); /* * Take the atid out of the lookup table. toep->tid is stale * after this but useful for debug. 
*/ CTR(KTR_CXGBE, "%s: atid %d@%d STRANDED, removed from table", __func__, atid, toep->incarnation); free_atid(sc, toep->tid); #if 0 toep->tid = -1; #endif mtx_lock(&td->toep_list_lock); toep->flags &= ~TPF_IN_TOEP_LIST; TAILQ_REMOVE(&td->toep_list, toep, link); TAILQ_INSERT_TAIL(&td->stranded_atids, toep, link); mtx_unlock(&td->toep_list_lock); } MPASS(atomic_load_int(&t->atids_in_use) == 0); } static void stop_tids(struct adapter *sc) { struct tom_data *td = sc->tom_softc; struct toepcb *toep; #ifdef INVARIANTS struct tid_info *t = &sc->tids; #endif /* * The LLD's offload queues are stopped so do_act_establish and * do_pass_accept_req cannot run and insert tids in parallel with this * thread. stop_stid_tab has also run and removed the synq entries' * tids from the table. The only tids in the table are for connections * at or beyond ESTABLISHED that are still waiting for the final CPL. */ mtx_lock(&td->toep_list_lock); TAILQ_FOREACH(toep, &td->toep_list, link) { MPASS(sc->incarnation == toep->incarnation); MPASS(toep->tid >= 0); MPASS(toep == lookup_tid(sc, toep->tid)); /* Remove tid from the lookup table immediately. */ CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", __func__, toep->tid, toep->incarnation); remove_tid(sc, toep->tid, toep->ce ? 2 : 1); #if 0 /* toep->tid is stale now but left alone for debug. */ toep->tid = -1; #endif /* All toep in this list will get bulk moved to stranded_tids */ toep->flags &= ~TPF_IN_TOEP_LIST; } MPASS(TAILQ_EMPTY(&td->stranded_tids)); TAILQ_CONCAT(&td->stranded_tids, &td->toep_list, link); MPASS(TAILQ_EMPTY(&td->toep_list)); mtx_unlock(&td->toep_list_lock); MPASS(atomic_load_int(&t->tids_in_use) == 0); } /* * L2T is stable because * 1. stop_lld stopped all new allocations. * 2. stop_lld also stopped the tx wrq so nothing is enqueueing new WRs to the * queue or to l2t_entry->wr_list. * 3. t4_l2t_update is ignoring all L2 updates. */ static void stop_tom_l2t(struct adapter *sc) { struct l2t_data *d = sc->l2t; struct tom_data *td = sc->tom_softc; struct l2t_entry *e; struct wrqe *wr; int i; /* * This task cannot be enqueued because L2 state changes are not being * processed. But if it's already scheduled or running then we need to * wait for it to cleanup the atids in the unsent_wr_list. */ taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); MPASS(STAILQ_EMPTY(&td->unsent_wr_list)); for (i = 0; i < d->l2t_size; i++) { e = &d->l2tab[i]; mtx_lock(&e->lock); if (e->state == L2T_STATE_VALID || e->state == L2T_STATE_STALE) e->state = L2T_STATE_RESOLVING; /* * stop_atids is going to clean up _all_ atids in use, including * these that were pending L2 resolution. Just discard the WRs. */ while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) { STAILQ_REMOVE_HEAD(&e->wr_list, link); free(wr, M_CXGBE); } mtx_unlock(&e->lock); } } static int t4_tom_stop(struct adapter *sc) { struct tid_info *t = &sc->tids; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); stop_tom_l2t(sc); if (atomic_load_int(&t->atids_in_use) > 0) stop_atids(sc); if (atomic_load_int(&t->stids_in_use) > 0) stop_stid_tab(sc); if (atomic_load_int(&t->tids_in_use) > 0) stop_tids(sc); taskqueue_enqueue(taskqueue_thread, &td->cleanup_stranded_tids); /* * L2T and atid_tab are restarted before t4_tom_restart so this assert * is not valid in t4_tom_restart. This is the next best place for it. 
*/ MPASS(STAILQ_EMPTY(&td->unsent_wr_list)); return (0); } static int t4_tom_restart(struct adapter *sc) { ASSERT_SYNCHRONIZED_OP(sc); restart_stid_tab(sc); return (0); } static int t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; int error, optval; if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) { if (sopt->sopt_dir != SOPT_SET) return (EOPNOTSUPP); if (sopt->sopt_td != NULL) { /* Only settable by the kernel. */ return (EPERM); } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) return (error); if (optval != 0) return (t4_enable_ddp_rcv(so, toep)); else return (EOPNOTSUPP); } return (tcp_ctloutput(so, sopt)); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; /* * No lock is needed as TOE sockets never change between * active and passive. */ if (SOLISTENING(so)) return (EINVAL); if (ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_NONE) { error = t4_aio_queue_ddp(so, job); if (error == 0) return (0); else if (error != EOPNOTSUPP) return (soaio_queue_generic(so, job)); } if (t4_aio_queue_aiotx(so, job) != 0) return (soaio_queue_generic(so, job)); else return (0); } static int t4_tom_mod_load(void) { /* CPL handlers */ t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); t4_ddp_mod_load(); t4_tls_mod_load(); bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw)); toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_aio_queue = t4_aio_queue_tom; bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_aio_queue = t4_aio_queue_tom; return (t4_register_uld(&tom_uld_info, ULD_TOM)); } static void tom_uninit(struct adapter *sc, void *arg) { bool *ok_to_unload = arg; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM) && t4_deactivate_uld(sc, ULD_TOM) != 0) *ok_to_unload = false; end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { bool ok_to_unload = true; t4_iterate(tom_uninit, &ok_to_unload); if (!ok_to_unload) return (EBUSY); if (t4_unregister_uld(&tom_uld_info, ULD_TOM) == EBUSY) return (EBUSY); t4_tls_mod_unload(); t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
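
As a reference for the connection-parameter selection above (find_best_mtu_idx() and select_rcv_wscale() in t4_tom.c), here is a minimal standalone sketch of the same two loops. The MTU table contents, the MAX_RCV_WND cap, and the explicit buffer-size parameter are illustrative assumptions for this sketch only; the driver uses the adapter's sc->params.mtus[] table, its own MAX_RCV_WND, and the global sb_max.

/*
 * Userspace sketch of the t4_tom MTU-index and receive window scale
 * selection.  The table below is made up; it only needs to be sorted in
 * ascending order, like the hardware's PMTU table.
 */
#include <stdio.h>

#define NMTUS			16
#define TCP_MAX_WINSHIFT	14
#define TCP_MAXWIN		65535
#define MAX_RCV_WND		(1UL << 27)	/* illustrative cap only */

static const unsigned short mtus[NMTUS] = {
	88, 256, 512, 576, 808, 1024, 1280, 1488,
	1500, 2002, 2048, 4096, 4352, 8192, 9000, 9600
};

/* Largest index whose MTU does not exceed the requested MTU. */
static int
find_best_mtu_idx(int mtu)
{
	int i;

	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
		continue;
	return (i);
}

/* Smallest window scale that lets the advertised window cover 'space'. */
static int
select_rcv_wscale(unsigned long space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;
	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;
	return (wscale);
}

int
main(void)
{
	int idx = find_best_mtu_idx(1500);

	/* MSS 1460 + IPv4/TCP headers gives MTU 1500, so the 1500 slot. */
	printf("mtu_idx for 1500: %d (mtu %d)\n", idx, mtus[idx]);
	printf("wscale for a 1MB receive buffer: %d\n",
	    select_rcv_wscale(1024UL * 1024));
	return (0);
}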