Index: head/sys/dev/cxgbe/tom/t4_tom.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_tom.c (revision 328607) +++ head/sys/dev/cxgbe/tom/t4_tom.c (revision 328608) @@ -1,1282 +1,1281 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include +#include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" static struct protosw toe_protosw; static struct pr_usrreqs toe_usrreqs; static struct protosw toe6_protosw; static struct pr_usrreqs toe6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void queue_tid_release(struct adapter *, int); static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static int add_lip(struct adapter *, struct in6_addr *); static int delete_lip(struct adapter *, struct in6_addr *); static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *); static void init_clip_table(struct adapter *, struct tom_data *); static void update_clip(struct adapter *, void *); static void t4_clip_task(void *, int); static void update_clip_table(struct adapter *, struct tom_data *); static void destroy_clip_table(struct adapter *, struct tom_data *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void 
*, int); static int in6_ifaddr_gen; static eventhandler_tag ifaddr_evhandler; static struct timeout_task clip_task; struct toepcb * alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); if (txqid < 0) txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; KASSERT(txqid >= vi->first_ofld_txq && txqid < vi->first_ofld_txq + vi->nofldtxq, ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi, vi->first_ofld_txq, vi->nofldtxq)); if (rxqid < 0) rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; KASSERT(rxqid >= vi->first_ofld_rxq && rxqid < vi->first_ofld_rxq + vi->nofldrxq, ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi, vi->first_ofld_rxq, vi->nofldrxq)); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; toep->tc_idx = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; toep->ofld_txq = &sc->sge.ofld_txq[txqid]; toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); ddp_init_toep(toep); return (toep); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); ddp_uninit_toep(toep); free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. 
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_len(&toep->ulp_pduq) == 0); MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0); #ifdef INVARIANTS ddp_assert_empty(toep); #endif if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) release_lip(td, toep->ce); #ifdef RATELIMIT if (toep->tc_idx != -1) t4_release_cl_rl_kbps(sc, toep->vi->pi->port_id, toep->tc_idx); #endif mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 
0 : 1), 0, 0, toep->ofld_rxq->iq.abs_id); break; default: break; } } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (toep->ulp_mode == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; toep->flags &= ~TPF_CPL_PENDING; mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } /* * What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, n; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; if (inc->inc_flags & INC_ISIPV6) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } -extern int always_keepalive; - /* * socket so could be a listening socket too. 
*/ uint64_t calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e, int mtu_idx, int rscale, int rx_credits, int ulp_mode) { uint64_t opt0; KASSERT(rx_credits <= M_RCV_BUFSIZ, ("%s: rcv_bufsiz too high", __func__)); opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) | V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - int keepalive = always_keepalive || + int keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0 |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0 |= V_L2T_IDX(e->idx); if (vi != NULL) { opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); } return htobe64(opt0); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->pi->adapter; struct tp_params *tp = &sc->params.tp; uint16_t viid = vi->viid; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0) { uint32_t vf = G_FW_VIID_VIN(viid); uint32_t pf = G_FW_VIID_PFN(viid); uint32_t vld = G_FW_VIID_VIVLD(viid); ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) | V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } void set_tcpddp_ulp_mode(struct toepcb *toep) { toep->ulp_mode = ULP_MODE_TCPDDP; toep->ddp_flags = DDP_OK; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tabs(struct tid_info *t) { size_t size; unsigned int i; size = t->ntids * sizeof(*t->tid_tab) + t->natids * sizeof(*t->atid_tab) + t->nstids * sizeof(*t->stid_tab); t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids]; t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tabs(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); } static int add_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); 
c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static int delete_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static struct clip_entry * search_lip(struct tom_data *td, struct in6_addr *lip) { struct clip_entry *ce; mtx_assert(&td->clip_table_lock, MA_OWNED); TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) return (ce); } return (NULL); } struct clip_entry * hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); if (ce == NULL) ce = search_lip(td, lip); if (ce != NULL) ce->refcount++; mtx_unlock(&td->clip_table_lock); return (ce); } void release_lip(struct tom_data *td, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); KASSERT(search_lip(td, &ce->lip) == ce, ("%s: CLIP entry %p p not in CLIP table.", __func__, ce)); KASSERT(ce->refcount > 0, ("%s: CLIP entry %p has refcount 0", __func__, ce)); --ce->refcount; mtx_unlock(&td->clip_table_lock); } static void init_clip_table(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF); TAILQ_INIT(&td->clip_table); td->clip_gen = -1; update_clip_table(sc, td); } static void update_clip(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc")) return; if (uld_active(sc, ULD_TOM)) update_clip_table(sc, sc->tom_softc); end_synchronized_op(sc, LOCK_HELD); } static void t4_clip_task(void *arg, int count) { t4_iterate(update_clip, NULL); } static void update_clip_table(struct adapter *sc, struct tom_data *td) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; struct in6_addr *lip, tlip; struct clip_head stale; struct clip_entry *ce, *ce_temp; struct vi_info *vi; int rc, gen, i, j; uintptr_t last_vnet; ASSERT_SYNCHRONIZED_OP(sc); IN6_IFADDR_RLOCK(&in6_ifa_tracker); mtx_lock(&td->clip_table_lock); gen = atomic_load_acq_int(&in6_ifaddr_gen); if (gen == td->clip_gen) goto done; TAILQ_INIT(&stale); TAILQ_CONCAT(&stale, &td->clip_table, link); /* * last_vnet optimizes the common cases where all if_vnet = NULL (no * VIMAGE) or all if_vnet = vnet0. */ last_vnet = (uintptr_t)(-1); for_each_port(sc, i) for_each_vi(sc->port[i], j, vi) { if (last_vnet == (uintptr_t)vi->ifp->if_vnet) continue; /* XXX: races with if_vmove */ CURVNET_SET(vi->ifp->if_vnet); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { lip = &ia->ia_addr.sin6_addr; KASSERT(!IN6_IS_ADDR_MULTICAST(lip), ("%s: mcast address in in6_ifaddr list", __func__)); if (IN6_IS_ADDR_LOOPBACK(lip)) continue; if (IN6_IS_SCOPE_EMBED(lip)) { /* Remove the embedded scope */ tlip = *lip; lip = &tlip; in6_clearscope(lip); } /* * XXX: how to weed out the link local address for the * loopback interface? It's fe80::1 usually (always?). */ /* * If it's in the main list then we already know it's * not stale. 
*/ TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) goto next; } /* * If it's in the stale list we should move it to the * main list. */ TAILQ_FOREACH(ce, &stale, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) { TAILQ_REMOVE(&stale, ce, link); TAILQ_INSERT_TAIL(&td->clip_table, ce, link); goto next; } } /* A new IP6 address; add it to the CLIP table */ ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); memcpy(&ce->lip, lip, sizeof(ce->lip)); ce->refcount = 0; rc = add_lip(sc, lip); if (rc == 0) TAILQ_INSERT_TAIL(&td->clip_table, ce, link); else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not add %s (%d)\n", __func__, ip, rc); free(ce, M_CXGBE); } next: continue; } CURVNET_RESTORE(); last_vnet = (uintptr_t)vi->ifp->if_vnet; } /* * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are * no longer referenced by the driver. */ TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) { if (ce->refcount == 0) { rc = delete_lip(sc, &ce->lip); if (rc == 0) { TAILQ_REMOVE(&stale, ce, link); free(ce, M_CXGBE); } else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not delete %s (%d)\n", __func__, ip, rc); } } } /* The ones that are still referenced need to stay in the CLIP table */ TAILQ_CONCAT(&td->clip_table, &stale, link); td->clip_gen = gen; done: mtx_unlock(&td->clip_table_lock); IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); } static void destroy_clip_table(struct adapter *sc, struct tom_data *td) { struct clip_entry *ce, *ce_temp; if (mtx_initialized(&td->clip_table_lock)) { mtx_lock(&td->clip_table_lock); TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) { KASSERT(ce->refcount == 0, ("%s: CLIP entry %p still in use (%d)", __func__, ce, ce->refcount)); TAILQ_REMOVE(&td->clip_table, ce, link); delete_lip(sc, &ce->lip); free(ce, M_CXGBE); } mtx_unlock(&td->clip_table_lock); mtx_destroy(&td->clip_table_lock); } } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); destroy_clip_table(sc, td); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid; struct wrqe *wr; struct adapter *sc; mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); sc = td_adapter(td); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a 
debugger. */ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; struct sge_ofld_rxq *ofld_rxq; int i, j, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); /* CLIP table for IPv6 offload */ init_clip_table(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { TOEDEV(vi->ifp) = &td->tod; for_each_ofld_rxq(vi, j, ofld_rxq) { ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl; ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2; } } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. 
*/ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static void t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp) { atomic_add_rel_int(&in6_ifaddr_gen, 1); taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; if (toep->ulp_mode == ULP_MODE_TCPDDP) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_tom_mod_load(void) { int rc; struct protosw *tcp_protosw, *tcp6_protosw; /* CPL handlers */ t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); rc = t4_ddp_mod_load(); if (rc != 0) return (rc); tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw)); bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs)); toe_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe_protosw.pr_usrreqs = &toe_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs)); toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe6_protosw.pr_usrreqs = &toe6_usrreqs; TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL); ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event, t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); rc = t4_register_uld(&tom_uld_info); if (rc != 0) t4_tom_mod_unload(); return (rc); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); if (ifaddr_evhandler) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler); taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL); } t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/netinet/tcp_timer.c =================================================================== 
--- head/sys/netinet/tcp_timer.c (revision 328607) +++ head/sys/netinet/tcp_timer.c (revision 328608) @@ -1,986 +1,987 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_persmin; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 
0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); -static int always_keepalive = 1; +int tcp_always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, - &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); VNET_DEFINE(int, tcp_pmtud_blackhole_detect); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); #ifdef INET VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif #ifdef RSS static int per_cpu_timers = 1; #else static int per_cpu_timers = 0; #endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); #if 0 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) #endif /* * Map the given inp to a CPU id. * * This queries RSS if it's compiled in, else it defaults to the current * CPU ID. */ static inline int inp_to_cpuid(struct inpcb *inp) { u_int cpuid; #ifdef RSS if (per_cpu_timers) { cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (curcpu); /* XXX */ else return (cpuid); } #else /* Legacy, pre-RSS behaviour */ if (per_cpu_timers) { /* * We don't have a flowid -> cpuid mapping, so cheat and * just map unknown cpuids to curcpu. Not the best, but * apparently better than defaulting to swi 0. */ cpuid = inp->inp_flowid % (mp_maxid + 1); if (! CPU_ABSENT(cpuid)) return (cpuid); return (curcpu); } #endif /* Default for RSS and non-RSS - cpuid 0 */ else { return (0); } } /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. 
*/ void tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } /* * When a timer wants to remove a TCB it must * hold the INP_INFO_RLOCK(). The timer function * should only have grabbed the INP_WLOCK() when * it entered. To safely switch to holding both the * INP_INFO_RLOCK() and the INP_WLOCK() we must first * grab a reference on the inp, which will hold the inp * so that it can't be removed. We then unlock the INP_WLOCK(), * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() * we proceed again to get the INP_WLOCK() (this preserves proper * lock order). After acquiring the INP_WLOCK we must check if someone * else deleted the pcb i.e. the inp_flags check. * If so we return 1 otherwise we return 0. * * No matter what the tcp_inpinfo_lock_add() function * returns the caller must afterwards call tcp_inpinfo_lock_del() * to drop the locks and reference properly. */ int tcp_inpinfo_lock_add(struct inpcb *inp) { in_pcbref(inp); INP_WUNLOCK(inp); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { return(1); } return(0); } void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) { INP_INFO_RUNLOCK(&V_tcbinfo); if (inp && (tp == NULL)) { /* * If tcp_close/drop() gets called and tp * returns NULL, then the function dropped * the inp lock, we hold a reference keeping * this around, so we must re-aquire the * INP_WLOCK() in order to proceed with * our dropping the inp reference. */ INP_WLOCK(inp); } if (inp && in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. 
* * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_close(tp); tcp_inpinfo_lock_del(inp, tp); goto out; } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp); } else { if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_close(tp); tcp_inpinfo_lock_del(inp, tp); goto out; } } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Because we don't regularly reset the keepalive callout in * the ESTABLISHED state, it may be that we don't actually need * to send a keepalive yet. If that occurs, schedule another * call for the next time the keepalive timer might expire. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { u_int idletime; idletime = ticks - tp->t_rcvtime; if (idletime < TP_KEEPIDLE(tp)) { callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } } /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; - if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + if ((tcp_always_keepalive || + inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. 
*/ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp); } else callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); tcp_inpinfo_lock_del(inp, tp); out: CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Persistence timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. 
*/ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); tcp_free_sackholes(tp); if (tp->t_fb->tfb_tcp_rexmit_tmr) { /* The stack has a timer action too. */ (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); } /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. 
*/ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. * The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, notify the L3 protocol that we're having * connection problems. 
*/ if (tp->t_rxtshift > TCP_RTT_INVALIDATE) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tp->t_fb->tfb_tcp_output(tp); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_activate) { tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { callout_stop(t_callout); } else { callout_reset_on(t_callout, delta, f_callout, tp, cpu); } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_active) { return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); } panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_stop) { /* * XXXrrs we need to look at this with the * stop case below (flags). */ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { /* * Can't stop the callout, defer tcpcb actual deletion * to the last one. 
We do this using the async drain * function and incrementing the count in */ tp->t_timers->tt_draincnt++; } } Index: head/sys/netinet/tcp_timer.h =================================================================== --- head/sys/netinet/tcp_timer.h (revision 328607) +++ head/sys/netinet/tcp_timer.h (revision 328608) @@ -1,225 +1,226 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments * have been sent for which ACKs are expected but not yet * received. If an ACK is received which advances tp->snd_una, * then the retransmit timer is cleared (if there are no more * outstanding segments) or reset to the base value (if there * are more ACKs expected). Whenever the retransmit timer goes off, * we retransmit one unacknowledged segment, and do a backoff * on the retransmit timer. * * The TCPT_PERSIST timer is used to keep window size information * flowing even if the window goes shut. If all previous transmissions * have been acknowledged (so that there are no retransmissions in progress), * and the window is too small to bother sending anything, then we start * the TCPT_PERSIST timer. When it expires, if the window is nonzero, * we go to transmit state. Otherwise, at intervals send a single byte * into the peer's window to force him to update our window information. * We do this at most as often as TCPT_PERSMIN time intervals, * but no more frequently than the current estimate of round-trip * packet time. The TCPT_PERSIST timer is cleared whenever we receive * a window update from the peer. * * The TCPT_KEEP timer is used to keep connections alive. If an * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, * but not yet established, then we drop the connection. 
 * Once the connection is established, if the connection is idle for
 * TCPTV_KEEP_IDLE time (and keepalives have been enabled on the socket),
 * we begin to probe the connection.  We force the peer to send us a
 * segment by sending:
 *	<SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK>
 * This segment is (deliberately) outside the window, and should elicit
 * an ack segment in response from the peer.  If, despite the TCPT_KEEP
 * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE
 * amount of time probing, then we drop the connection.
 */

/*
 * Time constants.
 */
#define	TCPTV_MSL	( 30*hz)		/* max seg lifetime (hah!) */
#define	TCPTV_SRTTBASE	0			/* base roundtrip time; if 0, no idea yet */
#define	TCPTV_RTOBASE	( 3*hz)			/* assumed RTO if no info */

#define	TCPTV_PERSMIN	( 5*hz)			/* minimum persist interval */
#define	TCPTV_PERSMAX	( 60*hz)		/* maximum persist interval */

#define	TCPTV_KEEP_INIT	( 75*hz)		/* initial connect keepalive */
#define	TCPTV_KEEP_IDLE	(120*60*hz)		/* dflt time before probing */
#define	TCPTV_KEEPINTVL	( 75*hz)		/* default probe interval */
#define	TCPTV_KEEPCNT	8			/* max probes before drop */

#define	TCPTV_FINWAIT2_TIMEOUT	(60*hz)		/* FIN_WAIT_2 timeout if no receiver */

/*
 * Minimum retransmit timer is 3 ticks, for algorithmic stability.
 * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with
 * the expected worst-case processing variances by the kernels
 * representing the end points.  Such variances do not always show
 * up in the srtt because the timestamp is often calculated at
 * the interface rather than at the TCP layer.  This value is
 * typically 50ms.  However, it is also possible that delayed
 * acks (typically 100ms) could create issues so we set the slop
 * to 200ms to try to cover it.  Note that, properly speaking,
 * delayed-acks should not create a major issue for interactive
 * environments which 'P'ush the last segment, at least as
 * long as implementations do the required 'at least one ack
 * for every two packets' for the non-interactive streaming case.
 * (maybe the RTO calculation should use 2*RTT instead of RTT
 * to handle the ack-every-other-packet case).
 *
 * The prior minimum of 1*hz (1 second) badly breaks throughput on any
 * networks faster than a modem that has minor (e.g. 1%) packet loss.
 */
#define	TCPTV_MIN	( hz/33 )		/* minimum allowable value */
#define	TCPTV_CPU_VAR	( hz/5 )		/* cpu variance allowed (200ms) */
#define	TCPTV_REXMTMAX	( 64*hz)		/* max allowable REXMT value */

#define	TCPTV_TWTRUNC	8			/* RTO factor to truncate TW */

#define	TCP_LINGERTIME	120			/* linger at most 2 minutes */

#define	TCP_MAXRXTSHIFT	12			/* maximum retransmits */

#define	TCPTV_DELACK	( hz/10 )		/* 100ms timeout */

/*
 * If we exceed this number of retransmits for a single segment, we'll consider
 * the current srtt measurement no longer valid and will recalculate from
 * scratch starting with the next ACK.
 */
#define	TCP_RTT_INVALIDATE	(TCP_MAXRXTSHIFT / 4)

#ifdef	TCPTIMERS
static const char *tcptimers[] =
    { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" };
#endif

/*
 * Force a time value to be in a certain range.
 */
#define	TCPT_RANGESET(tv, value, tvmin, tvmax) do {	\
	(tv) = (value) + tcp_rexmit_slop;		\
	if ((u_long)(tv) < (u_long)(tvmin))		\
		(tv) = (tvmin);				\
	if ((u_long)(tv) > (u_long)(tvmax))		\
		(tv) = (tvmax);				\
} while(0)

#ifdef _KERNEL

struct xtcp_timer;

struct tcp_timer {
	struct	callout tt_rexmt;	/* retransmit timer */
	struct	callout tt_persist;	/* retransmit persistence */
	struct	callout tt_keep;	/* keepalive */
	struct	callout tt_2msl;	/* 2*msl TIME_WAIT timer */
	struct	callout tt_delack;	/* delayed ACK timer */
	uint32_t	tt_flags;	/* Timers flags */
	uint32_t	tt_draincnt;	/* Count being drained */
};

/*
 * Flags for the tt_flags field.
 */
#define	TT_DELACK	0x0001
#define	TT_REXMT	0x0002
#define	TT_PERSIST	0x0004
#define	TT_KEEP		0x0008
#define	TT_2MSL		0x0010
#define	TT_MASK		(TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)

#define	TT_DELACK_RST	0x0100
#define	TT_REXMT_RST	0x0200
#define	TT_PERSIST_RST	0x0400
#define	TT_KEEP_RST	0x0800
#define	TT_2MSL_RST	0x1000

#define	TT_STOPPED	0x00010000

#define	TP_KEEPINIT(tp)	((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
#define	TP_KEEPIDLE(tp)	((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
#define	TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
#define	TP_KEEPCNT(tp)	((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt)
#define	TP_MAXIDLE(tp)	(TP_KEEPCNT(tp) * TP_KEEPINTVL(tp))

extern int tcp_persmin;			/* minimum persist interval */
extern int tcp_persmax;			/* maximum persist interval */
extern int tcp_keepinit;		/* time to establish connection */
extern int tcp_keepidle;		/* time before keepalive probes begin */
extern int tcp_keepintvl;		/* time between keepalive probes */
extern int tcp_keepcnt;			/* number of keepalives */
extern int tcp_delacktime;		/* time before sending a delayed ACK */
extern int tcp_maxpersistidle;
extern int tcp_rexmit_min;
extern int tcp_rexmit_slop;
extern int tcp_msl;
extern int tcp_ttl;			/* time to live for TCP segs */
extern int tcp_backoff[];
extern int tcp_syn_backoff[];
+extern int tcp_always_keepalive;

extern int tcp_finwait2_timeout;
extern int tcp_fast_finwait2_recycle;

VNET_DECLARE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
VNET_DECLARE(int, tcp_pmtud_blackhole_mss);
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss);
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)

int	 tcp_inpinfo_lock_add(struct inpcb *inp);
void	 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp);

void	 tcp_timer_init(void);
void	 tcp_timer_2msl(void *xtp);
void	 tcp_timer_discard(void *);
struct tcptw *tcp_tw_2msl_scan(int reuse);	/* XXX temporary? */
void	 tcp_timer_keep(void *xtp);
void	 tcp_timer_persist(void *xtp);
void	 tcp_timer_rexmt(void *xtp);
void	 tcp_timer_delack(void *xtp);

#endif /* _KERNEL */

#endif /* !_NETINET_TCP_TIMER_H_ */
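
The interaction between TCPT_RANGESET() and the keepalive knobs above is easier to see with
concrete numbers.  The short program below is a minimal userland sketch, not part of either
file in this change: it re-creates the clamping macro and the TP_MAXIDLE() arithmetic with
assumed values (hz = 1000, the default 8 probes at 75 s intervals, a 200 ms slop) purely to
illustrate how a raw retransmit timeout picks up the slop and how long keepalive probing runs
before a connection is dropped.

/*
 * Illustrative sketch only; hz, the slop, and the keepalive values are
 * stand-ins, not values read from a running kernel.
 */
#include <stdio.h>

#define HZ	1000				/* assumed ticks per second */

static unsigned long tcp_rexmit_slop = HZ / 5;	/* 200 ms, mirrors TCPTV_CPU_VAR */

#define TCPT_RANGESET(tv, value, tvmin, tvmax) do {		\
	(tv) = (value) + tcp_rexmit_slop;			\
	if ((unsigned long)(tv) < (unsigned long)(tvmin))	\
		(tv) = (tvmin);					\
	if ((unsigned long)(tv) > (unsigned long)(tvmax))	\
		(tv) = (tvmax);					\
} while (0)

int
main(void)
{
	unsigned long rexmt;

	/*
	 * A raw RTO estimate of 10 ms has the 200 ms slop added and is then
	 * clamped to [hz/33, 64*hz]; the result here is 210 ticks.
	 */
	TCPT_RANGESET(rexmt, HZ / 100, HZ / 33, 64 * HZ);
	printf("clamped RTO: %lu ticks\n", rexmt);

	/*
	 * Keepalive probing budget: 8 probes spaced 75 s apart, as with the
	 * defaults above, gives 600 s of unanswered probing before the drop.
	 */
	int keepcnt = 8;			/* cf. TCPTV_KEEPCNT */
	int keepintvl = 75 * HZ;		/* cf. TCPTV_KEEPINTVL */
	printf("probing budget: %d ticks (%d s)\n",
	    keepcnt * keepintvl, keepcnt * keepintvl / HZ);
	return (0);
}

Built with a stock C compiler this prints 210 ticks for the clamped timeout and a 600 s
probing budget, the same product TP_MAXIDLE() computes from TP_KEEPCNT() and TP_KEEPINTVL()
when no per-connection overrides are set.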