Index: stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c =================================================================== --- stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 330302) +++ stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 330303) @@ -1,1811 +1,1810 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include "cxgb_include.h" #include "ulp/tom/cxgb_l2t.h" #include "ulp/tom/cxgb_tom.h" #include "ulp/tom/cxgb_toepcb.h" VNET_DECLARE(int, tcp_do_autosndbuf); #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) VNET_DECLARE(int, tcp_autosndbuf_inc); #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) VNET_DECLARE(int, tcp_autosndbuf_max); #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) VNET_DECLARE(int, tcp_do_autorcvbuf); #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) VNET_DECLARE(int, tcp_autorcvbuf_inc); #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) VNET_DECLARE(int, tcp_autorcvbuf_max); #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) -extern int always_keepalive; /* * For ULP connections HW may add headers, e.g., for digests, that aren't part * of the messages sent by the host but that are part of the TCP payload and * therefore consume TCP sequence space. Tx connection parameters that * operate in TCP sequence space are affected by the HW additions and need to * compensate for them to accurately track TCP sequence numbers. This array * contains the compensating extra lengths for ULP packets. It is indexed by * a packet's ULP submode. */ const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. */ #define MAX_RCV_WND ((1U << 27) - 1) /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) static void t3_release_offload_resources(struct toepcb *); static void send_reset(struct toepcb *toep); /* * Called after the last CPL for the toepcb has been received. * * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the * time this function exits. */ static int toepcb_release(struct toepcb *toep) { struct inpcb *inp = toep->tp_inp; struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); int rc; INP_WLOCK_ASSERT(inp); KASSERT(!(toep->tp_flags & TP_CPL_DONE), ("%s: double release?", __func__)); CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid); toep->tp_flags |= TP_CPL_DONE; toep->tp_inp = NULL; mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); if (!(toep->tp_flags & TP_ATTACHED)) t3_release_offload_resources(toep); rc = in_pcbrele_wlocked(inp); if (!rc) INP_WUNLOCK(inp); return (rc); } /* * One sided detach. The tcpcb is going away and we need to unhook the toepcb * hanging off it. If the TOE driver is also done with the toepcb we'll release * all offload resources. */ static void toepcb_detach(struct inpcb *inp) { struct toepcb *toep; struct tcpcb *tp; KASSERT(inp, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); toep = tp->t_toe; KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__)); CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__, tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid, toep, inp, tp); tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->tp_flags &= ~TP_ATTACHED; if (toep->tp_flags & TP_CPL_DONE) t3_release_offload_resources(toep); } void t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { toepcb_detach(tp->t_inpcb); } static int alloc_atid(struct tid_info *t, void *ctx) { int atid = -1; mtx_lock(&t->atid_lock); if (t->afree) { union active_open_entry *p = t->afree; atid = (p - t->atid_tab) + t->atid_base; t->afree = p->next; p->ctx = ctx; t->atids_in_use++; } mtx_unlock(&t->atid_lock); return (atid); } static void free_atid(struct tid_info *t, int atid) { union active_open_entry *p = atid2entry(t, atid); mtx_lock(&t->atid_lock); p->next = t->afree; t->afree = p; t->atids_in_use--; mtx_unlock(&t->atid_lock); } void insert_tid(struct tom_data *td, void *ctx, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, 1); } void update_tid(struct tom_data *td, void *ctx, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = ctx; } void remove_tid(struct tom_data *td, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = NULL; atomic_add_int(&t->tids_in_use, -1); } /* use ctx as a next pointer in the tid release list */ void queue_tid_release(struct toedev *tod, unsigned int tid) { struct tom_data *td = t3_tomdata(tod); void **p = &td->tid_maps.tid_tab[tid]; struct adapter *sc = tod->tod_softc; mtx_lock(&td->tid_release_lock); *p = td->tid_release_list; td->tid_release_list = p; if (!*p) taskqueue_enqueue(sc->tq, &td->tid_release_task); mtx_unlock(&td->tid_release_lock); } /* * Populate a TID_RELEASE WR. */ static inline void mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid) { cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); } void release_tid(struct toedev *tod, unsigned int tid, int qset) { struct tom_data *td = t3_tomdata(tod); struct adapter *sc = tod->tod_softc; struct mbuf *m; struct cpl_tid_release *cpl; #ifdef INVARIANTS struct tid_info *t = &td->tid_maps; #endif KASSERT(tid >= 0 && tid < t->ntids, ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids)); m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); if (m) { mk_tid_release(cpl, tid); t3_offload_tx(sc, m); remove_tid(td, tid); } else queue_tid_release(tod, tid); } void t3_process_tid_release_list(void *data, int pending) { struct mbuf *m; struct tom_data *td = data; struct adapter *sc = td->tod.tod_softc; mtx_lock(&td->tid_release_lock); while (td->tid_release_list) { void **p = td->tid_release_list; unsigned int tid = p - td->tid_maps.tid_tab; struct cpl_tid_release *cpl; td->tid_release_list = (void **)*p; m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */ if (m == NULL) break; /* XXX: who reschedules the release task? */ mtx_unlock(&td->tid_release_lock); mk_tid_release(cpl, tid); t3_offload_tx(sc, m); remove_tid(td, tid); mtx_lock(&td->tid_release_lock); } mtx_unlock(&td->tid_release_lock); } static void close_conn(struct adapter *sc, struct toepcb *toep) { struct mbuf *m; struct cpl_close_con_req *req; if (toep->tp_flags & TP_FIN_SENT) return; m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); if (m == NULL) CXGB_UNIMPLEMENTED(); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid)); req->rsvd = 0; toep->tp_flags |= TP_FIN_SENT; t3_offload_tx(sc, m); } static inline void make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len, struct mbuf *tail) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct sockbuf *snd; inp_lock_assert(tp->t_inpcb); snd = so_sockbuf_snd(so); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); /* len includes the length of any HW ULP additions */ req->len = htonl(len); req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); /* V_TX_ULP_SUBMODE sets both the mode and submode */ req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) | V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1))); req->sndseq = htonl(tp->snd_nxt); if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { struct adapter *sc = toep->tp_tod->tod_softc; int cpu_idx = sc->rrss_map[toep->tp_qset]; req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | V_TX_CPU_IDX(cpu_idx)); /* Sendbuffer is in units of 32KB. */ if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15)); else req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); toep->tp_flags |= TP_DATASENT; } } /* * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc. * TOM_XXX_MOVE to some common header file. */ /* * IMM_LEN: # of bytes that can be tx'd as immediate data. There are 16 flits * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more * for the second gen bit flit. This leaves us with 12 flits. * * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs. * The first desc has a tx_data_wr (which includes the WR header), the rest have * the WR header only. All descs have the second gen bit flit. * * sgllen_to_descs: # of tx descs used up by an sgl of given length. The first * desc has a tx_data_wr (which includes the WR header), the rest have the WR * header only. All descs have the second gen bit flit. * * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits. * */ #define IMM_LEN 96 static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35}; static int sgllen_to_descs[TX_MAX_SEGS] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, /* 0 - 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, /* 10 - 19 */ 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, /* 20 - 29 */ 4, 4, 4, 4, 4, 4 /* 30 - 35 */ }; #if 0 static int flits_to_sgllen[TX_DESC_FLITS + 1] = { 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10 }; #endif #if SGE_NUM_GENBITS != 2 #error "SGE_NUM_GENBITS really must be 2" #endif int t3_push_frames(struct socket *so, int req_completion) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m0, *sndptr, *m; struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; int bytes, ndesc, total_bytes = 0, mlen; struct sockbuf *snd; struct sglist *sgl; struct ofld_hdr *oh; caddr_t dst; struct tx_data_wr *wr; inp_lock_assert(tp->t_inpcb); snd = so_sockbuf_snd(so); SOCKBUF_LOCK(snd); /* * Autosize the send buffer. */ if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) { if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) && snd->sb_cc < VNET(tcp_autosndbuf_max)) { if (!sbreserve_locked(snd, min(snd->sb_hiwat + VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)), so, curthread)) snd->sb_flags &= ~SB_AUTOSIZE; } } if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr) sndptr = toep->tp_m_last->m_next; else sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; /* Nothing to send or no WRs available for sending data */ if (toep->tp_wr_avail == 0 || sndptr == NULL) goto out; /* Something to send and at least 1 WR available */ while (toep->tp_wr_avail && sndptr != NULL) { m0 = m_gethdr(M_NOWAIT, MT_DATA); if (m0 == NULL) break; oh = mtod(m0, struct ofld_hdr *); wr = (void *)(oh + 1); dst = (void *)(wr + 1); m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr); oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF | V_HDR_QSET(toep->tp_qset); /* * Try to construct an immediate data WR if possible. Stuff as * much data into it as possible, one whole mbuf at a time. */ mlen = sndptr->m_len; ndesc = bytes = 0; while (mlen <= IMM_LEN - bytes) { bcopy(sndptr->m_data, dst, mlen); bytes += mlen; dst += mlen; if (!(sndptr = sndptr->m_next)) break; mlen = sndptr->m_len; } if (bytes) { /* Was able to fit 'bytes' bytes in an immediate WR */ ndesc = 1; make_tx_data_wr(so, wr, bytes, sndptr); m0->m_len += bytes; m0->m_pkthdr.len = m0->m_len; } else { int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC); /* Need to make an SGL */ sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT); if (sgl == NULL) break; for (m = sndptr; m != NULL; m = m->m_next) { if ((mlen = m->m_len) > 0) { if (sglist_append(sgl, m->m_data, mlen)) break; } bytes += mlen; } sndptr = m; if (bytes == 0) { sglist_free(sgl); break; } ndesc = sgllen_to_descs[sgl->sg_nseg]; oh->flags |= F_HDR_SGL; oh->sgl = sgl; make_tx_data_wr(so, wr, bytes, sndptr); } oh->flags |= V_HDR_NDESC(ndesc); oh->plen = bytes; snd->sb_sndptr = sndptr; snd->sb_sndptroff += bytes; if (sndptr == NULL) { snd->sb_sndptr = snd->sb_mbtail; snd->sb_sndptroff -= snd->sb_mbtail->m_len; toep->tp_m_last = snd->sb_mbtail; } else toep->tp_m_last = NULL; total_bytes += bytes; toep->tp_wr_avail -= ndesc; toep->tp_wr_unacked += ndesc; if ((req_completion && toep->tp_wr_unacked == ndesc) || toep->tp_wr_unacked >= toep->tp_wr_max / 2) { wr->wr.wrh_hi |= htonl(F_WR_COMPL); toep->tp_wr_unacked = 0; } enqueue_wr(toep, m0); l2t_send(sc, m0, toep->tp_l2t); } out: SOCKBUF_UNLOCK(snd); if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN)) close_conn(sc, toep); return (total_bytes); } static int send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct mbuf *m; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req); if (m == NULL) return (0); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wrh_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); t3_offload_tx(sc, m); return (credits); } void t3_rcvd(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; struct sockbuf *so_rcv = &so->so_rcv; struct toepcb *toep = tp->t_toe; int must_send; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(so_rcv); KASSERT(toep->tp_enqueued >= so_rcv->sb_cc, ("%s: so_rcv->sb_cc > enqueued", __func__)); toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc; toep->tp_enqueued = so_rcv->sb_cc; SOCKBUF_UNLOCK(so_rcv); must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd; if (must_send || toep->tp_rx_credits >= 15 * 1024) { int credits; credits = send_rx_credits(sc, toep, toep->tp_rx_credits); toep->tp_rx_credits -= credits; tp->rcv_wnd += credits; tp->rcv_adv += credits; } } static int do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rx_urg_notify *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp); m_freem(m); return (0); } int t3_send_fin(struct toedev *tod, struct tcpcb *tp) { struct toepcb *toep = tp->t_toe; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp_inpcbtosocket(inp); #if defined(KTR) unsigned int tid = toep->tp_tid; #endif INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep, toep->tp_flags); toep->tp_flags |= TP_SEND_FIN; t3_push_frames(so, 1); return (0); } int t3_tod_output(struct toedev *tod, struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; t3_push_frames(so, 1); return (0); } /* What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i = 0, mss; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40) ++i; return (i); } static inline void purge_wr_queue(struct toepcb *toep) { struct mbuf *m; struct ofld_hdr *oh; while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) { oh = mtod(m, struct ofld_hdr *); if (oh->flags & F_HDR_SGL) sglist_free(oh->sgl); m_freem(m); } } /* * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T * entry, etc.) */ static void t3_release_offload_resources(struct toepcb *toep) { struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); /* * The TOM explicitly detaches its toepcb from the system's inp before * it releases the offload resources. */ if (toep->tp_inp) { panic("%s: inp %p still attached to toepcb %p", __func__, toep->tp_inp, toep); } if (toep->tp_wr_avail != toep->tp_wr_max) purge_wr_queue(toep); if (toep->tp_l2t) { l2t_release(td->l2t, toep->tp_l2t); toep->tp_l2t = NULL; } if (toep->tp_tid >= 0) release_tid(tod, toep->tp_tid, toep->tp_qset); toepcb_free(toep); } /* * Determine the receive window size for a socket. */ unsigned long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); INP_WLOCK_ASSERT(inp); /* Update socket */ SOCKBUF_LOCK(&so->so_snd); so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(&so->so_rcv); /* Update TCP PCB */ tp->tod = toep->tp_tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->tp_inp = inp; toep->tp_flags |= TP_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); INP_WLOCK_ASSERT(inp); so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE; so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE; tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->tp_inp = NULL; toep->tp_flags &= ~TP_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* * Socket could be a listening socket, and we may not have a toepcb at all at * this time. */ uint32_t calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e) { uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - int keepalive = always_keepalive || + int keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0h |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx); return (htobe32(opt0h)); } uint32_t calc_opt0l(struct socket *so, int rcv_bufsize) { uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize); KASSERT(rcv_bufsize <= M_RCV_BUFSIZ, ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize)); if (so != NULL) /* optional because noone cares about IP TOS */ opt0l |= V_TOS(INP_TOS(sotoinpcb(so))); return (htobe32(opt0l)); } /* * Convert an ACT_OPEN_RPL status to an errno. */ static int act_open_rpl_status_to_errno(int status) { switch (status) { case CPL_ERR_CONN_RESET: return (ECONNREFUSED); case CPL_ERR_ARP_MISS: return (EHOSTUNREACH); case CPL_ERR_CONN_TIMEDOUT: return (ETIMEDOUT); case CPL_ERR_TCAM_FULL: return (EAGAIN); case CPL_ERR_CONN_EXIST: log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); return (EAGAIN); default: return (EIO); } } /* * Return whether a failed active open has allocated a TID */ static inline int act_open_has_tid(int status) { return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && status != CPL_ERR_ARP_MISS; } /* * Active open failed. */ static int do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; struct cpl_act_open_rpl *rpl = mtod(m, void *); unsigned int atid = G_TID(ntohl(rpl->atid)); struct toepcb *toep = lookup_atid(&td->tid_maps, atid); struct inpcb *inp = toep->tp_inp; int s = rpl->status, rc; CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s); free_atid(&td->tid_maps, atid); toep->tp_tid = -1; if (act_open_has_tid(s)) queue_tid_release(tod, GET_TID(rpl)); rc = act_open_rpl_status_to_errno(s); if (rc != EAGAIN) INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, rc); toepcb_release(toep); /* unlocks inp */ if (rc != EAGAIN) INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* * Send an active open request. * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (this has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t3_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct mbuf *m = NULL; struct l2t_entry *e = NULL; struct tom_data *td = t3_tomdata(tod); struct adapter *sc = tod->tod_softc; struct cpl_act_open_req *cpl; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep; int atid = -1, mtu_idx, rscale, cpu_idx, qset; struct sockaddr *gw; struct ifnet *ifp = rt->rt_ifp; struct port_info *pi = ifp->if_softc; /* XXX wrong for VLAN etc. */ INP_WLOCK_ASSERT(inp); toep = toepcb_alloc(tod); if (toep == NULL) goto failed; atid = alloc_atid(&td->tid_maps, toep); if (atid < 0) goto failed; qset = pi->first_qset + (arc4random() % pi->nqsets); m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); if (m == NULL) goto failed; gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam; e = t3_l2t_get(pi, ifp, gw); if (e == NULL) goto failed; toep->tp_l2t = e; toep->tp_tid = atid; /* used to double check response */ toep->tp_qset = qset; SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); offload_socket(so, toep); /* * The kernel sets request_r_scale based on sb_max whereas we need to * take hardware's MAX_RCV_WND into account too. This is normally a * no-op as MAX_RCV_WND is much larger than the default sb_max. */ if (tp->t_flags & TF_REQ_SCALE) rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); cpu_idx = sc->rrss_map[qset]; cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD)); cpl->wr.wrh_lo = 0; OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e); cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits); cpl->params = 0; cpl->opt2 = calc_opt2(cpu_idx); CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__, toep->tp_tid, tcpstates[tp->t_state], toep, inp); if (l2t_send(sc, m, e) == 0) return (0); undo_offload_socket(so); failed: CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p", __func__, atid, toep, e, m); if (atid >= 0) free_atid(&td->tid_maps, atid); if (e) l2t_release(td->l2t, e); if (toep) toepcb_free(toep); m_freem(m); return (ENOMEM); } /* * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do not * send multiple ABORT_REQs for the same connection and also that we do not try * to send a message after the connection has closed. */ static void send_reset(struct toepcb *toep) { struct cpl_abort_req *req; unsigned int tid = toep->tp_tid; struct inpcb *inp = toep->tp_inp; struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; struct mbuf *m; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep, toep->tp_flags); if (toep->tp_flags & TP_ABORT_SHUTDOWN) return; toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN); /* Purge the send queue */ sbflush(so_sockbuf_snd(so)); purge_wr_queue(toep); m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); if (m == NULL) CXGB_UNIMPLEMENTED(); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); req->rsvd0 = htonl(tp->snd_nxt); req->rsvd1 = !(toep->tp_flags & TP_DATASENT); req->cmd = CPL_ABORT_SEND_RST; if (tp->t_state == TCPS_SYN_SENT) mbufq_tail(&toep->out_of_order_queue, m); /* defer */ else l2t_send(sc, m, toep->tp_l2t); } int t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp) { send_reset(tp->t_toe); return (0); } /* * Handler for RX_DATA CPL messages. */ static int do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rx_data *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; struct sockbuf *so_rcv; /* Advance over CPL */ m_adj(m, sizeof(*hdr)); /* XXX: revisit. This comes from the T4 TOM */ if (__predict_false(inp == NULL)) { /* * do_pass_establish failed and must be attempting to abort the * connection. Meanwhile, the T4 has sent us data for such a * connection. */ #ifdef notyet KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), ("%s: inp NULL and tid isn't being aborted", __func__)); #endif m_freem(m); return (0); } INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, m->m_pkthdr.len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) toep->tp_delack_mode = hdr->dack_mode; tp = intotcpcb(inp); #ifdef INVARIANTS if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) { log(LOG_ERR, "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt); } #endif tp->rcv_nxt += m->m_pkthdr.len; KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, ("%s: negative window size", __func__)); tp->rcv_wnd -= m->m_pkthdr.len; tp->t_rcvtime = ticks; so = inp->inp_socket; so_rcv = &so->so_rcv; SOCKBUF_LOCK(so_rcv); if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)", __func__, tid, m->m_pkthdr.len); SOCKBUF_UNLOCK(so_rcv); INP_WUNLOCK(inp); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* receive buffer autosize */ if (so_rcv->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) { unsigned int hiwat = so_rcv->sb_hiwat; unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so_rcv, newsize, so, NULL)) so_rcv->sb_flags &= ~SB_AUTOSIZE; else toep->tp_rx_credits += newsize - hiwat; } toep->tp_enqueued += m->m_pkthdr.len; sbappendstream_locked(so_rcv, m); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(so_rcv); INP_WUNLOCK(inp); return (0); } /* * Handler for PEER_CLOSE CPL messages. */ static int do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_peer_close *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp); if (toep->tp_flags & TP_ABORT_RPL_PENDING) goto done; so = inp_inpcbtosocket(inp); socantrcvmore(so); tp->rcv_nxt++; switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ m_freem(m); return (0); default: log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", __func__, toep->tp_tid, tp->t_state); } done: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* * Handler for CLOSE_CON_RPL CPL messages. peer ACK to our FIN received. */ static int do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_close_con_rpl *rpl = mtod(m, void *); unsigned int tid = GET_TID(rpl); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags); if ((toep->tp_flags & TP_ABORT_RPL_PENDING)) goto done; so = inp_inpcbtosocket(inp); tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ switch (tp->t_state) { case TCPS_CLOSING: tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ m_freem(m); return (0); case TCPS_LAST_ACK: if (tcp_close(tp)) INP_WUNLOCK(inp); goto release; case TCPS_FIN_WAIT_1: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) soisdisconnected(so); tp->t_state = TCPS_FIN_WAIT_2; break; default: log(LOG_ERR, "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", __func__, toep->tp_tid, tp->t_state); } done: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } static int do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct cpl_smt_write_rpl *rpl = mtod(m, void *); if (rpl->status != CPL_ERR_NONE) { log(LOG_ERR, "Unexpected SMT_WRITE_RPL status %u for entry %u\n", rpl->status, GET_TID(rpl)); } m_freem(m); return (0); } static int do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct cpl_set_tcb_rpl *rpl = mtod(m, void *); if (rpl->status != CPL_ERR_NONE) { log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n", rpl->status, GET_TID(rpl)); } m_freem(m); return (0); } /* * Handle an ABORT_RPL_RSS CPL message. */ static int do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); unsigned int tid = GET_TID(rpl); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; /* * Ignore replies to post-close aborts indicating that the abort was * requested too late. These connections are terminated when we get * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss * arrives the TID is either no longer used or it has been recycled. */ if (rpl->status == CPL_ERR_ABORT_FAILED) { m_freem(m); return (0); } if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) return (do_abort_rpl_synqe(qs, r, m)); CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, rpl->status); inp = toep->tp_inp; INP_WLOCK(inp); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { toep->tp_flags |= TP_ABORT_RPL_RCVD; INP_WUNLOCK(inp); } else { toep->tp_flags &= ~TP_ABORT_RPL_RCVD; toep->tp_flags &= TP_ABORT_RPL_PENDING; toepcb_release(toep); /* no more CPLs expected */ } } m_freem(m); return (0); } /* * Convert the status code of an ABORT_REQ into a FreeBSD error code. */ static int abort_status_to_errno(struct tcpcb *tp, int abort_reason) { switch (abort_reason) { case CPL_ERR_BAD_SYN: case CPL_ERR_CONN_RESET: return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: return (ETIMEDOUT); default: return (EIO); } } /* * Returns whether an ABORT_REQ_RSS message is a negative advice. */ static inline int is_neg_adv_abort(unsigned int status) { return status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE; } void send_abort_rpl(struct toedev *tod, int tid, int qset) { struct mbuf *reply; struct cpl_abort_rpl *rpl; struct adapter *sc = tod->tod_softc; reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); if (!reply) CXGB_UNIMPLEMENTED(); rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); rpl->cmd = CPL_ABORT_NO_RST; t3_offload_tx(sc, reply); } /* * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we * ignore this request except that we need to reply to it. */ static int do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; const struct cpl_abort_req_rss *req = mtod(m, void *); unsigned int tid = GET_TID(req); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; struct tcpcb *tp; struct socket *so; int qset = toep->tp_qset; if (is_neg_adv_abort(req->status)) { CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)", __func__, req->status, tid, toep->tp_flags); m_freem(m); return (0); } if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) return (do_abort_req_synqe(qs, r, m)); inp = toep->tp_inp; INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); so = inp->inp_socket; CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d", __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags, req->status); if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) { toep->tp_flags |= TP_ABORT_REQ_RCVD; toep->tp_flags |= TP_ABORT_SHUTDOWN; INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } toep->tp_flags &= ~TP_ABORT_REQ_RCVD; /* * If we'd sent a reset on this toep, we'll ignore this and clean up in * the T3's reply to our reset instead. */ if (toep->tp_flags & TP_ABORT_RPL_PENDING) { toep->tp_flags |= TP_ABORT_RPL_SENT; INP_WUNLOCK(inp); } else { so_error_set(so, abort_status_to_errno(tp, req->status)); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ toepcb_release(toep); /* no more CPLs expected */ } INP_INFO_RUNLOCK(&V_tcbinfo); send_abort_rpl(tod, tid, qset); m_freem(m); return (0); } static void assign_rxopt(struct tcpcb *tp, uint16_t tcpopt) { struct toepcb *toep = tp->t_toe; struct adapter *sc = toep->tp_tod->tod_softc; tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; if (G_TCPOPT_TSTAMP(tcpopt)) { tp->t_flags |= TF_RCVD_TSTMP; tp->t_flags |= TF_REQ_TSTMP; /* forcibly set */ tp->ts_recent = 0; /* XXX */ tp->ts_recent_age = tcp_ts_getticks(); tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; } if (G_TCPOPT_SACK(tcpopt)) tp->t_flags |= TF_SACK_PERMIT; else tp->t_flags &= ~TF_SACK_PERMIT; if (G_TCPOPT_WSCALE_OK(tcpopt)) tp->t_flags |= TF_RCVD_SCALE; if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt); } } /* * The ISS and IRS are from after the exchange of SYNs and are off by 1. */ void make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs, uint16_t cpl_tcpopt) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; long bufsize; uint32_t iss = be32toh(cpl_iss) - 1; /* true ISS */ uint32_t irs = be32toh(cpl_irs) - 1; /* true IRS */ uint16_t tcpopt = be16toh(cpl_tcpopt); INP_WLOCK_ASSERT(inp); tp->t_state = TCPS_ESTABLISHED; tp->t_starttime = ticks; TCPSTAT_INC(tcps_connects); CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state], toep->tp_tid, toep, inp); tp->irs = irs; tcp_rcvseqinit(tp); tp->rcv_wnd = toep->tp_rx_credits << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; /* * If we were unable to send all rx credits via opt0, save the remainder * in rx_credits so that they can be handed over with the next credit * update. */ SOCKBUF_LOCK(&so->so_rcv); bufsize = select_rcv_wnd(so); SOCKBUF_UNLOCK(&so->so_rcv); toep->tp_rx_credits = bufsize - tp->rcv_wnd; tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; tp->snd_nxt = iss + 1; tp->snd_max = iss + 1; assign_rxopt(tp, tcpopt); soisconnected(so); } /* * Fill in the right TID for CPL messages waiting in the out-of-order queue * and send them to the TOE. */ static void fixup_and_send_ofo(struct toepcb *toep) { struct mbuf *m; struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; unsigned int tid = toep->tp_tid; inp_lock_assert(toep->tp_inp); while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { struct ofld_hdr *oh = mtod(m, void *); /* * A variety of messages can be waiting but the fields we'll * be touching are common to all so any message type will do. */ struct cpl_close_con_req *p = (void *)(oh + 1); p->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); t3_offload_tx(sc, m); } } /* * Process a CPL_ACT_ESTABLISH message. */ static int do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_act_establish *req = mtod(m, void *); unsigned int tid = GET_TID(req); unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); struct toepcb *toep = lookup_atid(&td->tid_maps, atid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(&td->tid_maps, atid); INP_WLOCK(inp); tp = intotcpcb(inp); KASSERT(toep->tp_qset == qs->idx, ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx)); KASSERT(toep->tp_tid == atid, ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid)); toep->tp_tid = tid; insert_tid(td, toep, tid); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_reset(toep); goto done; } KASSERT(tp->t_state == TCPS_SYN_SENT, ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state)); so = inp->inp_socket; make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt); /* * Now that we finally have a TID send any CPL messages that we had to * defer for lack of a TID. */ if (mbufq_len(&toep->out_of_order_queue)) fixup_and_send_ofo(toep); done: INP_WUNLOCK(inp); m_freem(m); return (0); } /* * Process an acknowledgment of WR completion. Advance snd_una and send the * next batch of work requests from the write queue. */ static void wr_ack(struct toepcb *toep, struct mbuf *m) { struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct cpl_wr_ack *hdr = mtod(m, void *); struct socket *so; unsigned int credits = ntohs(hdr->credits); u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; struct sockbuf *snd; struct mbuf *p; struct ofld_hdr *oh; inp_wlock(inp); tp = intotcpcb(inp); so = inp->inp_socket; toep->tp_wr_avail += credits; if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; while (credits) { p = peek_wr(toep); if (__predict_false(!p)) { CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, " "tid %u, state %u, wr_avail %u", __func__, credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); log(LOG_ERR, "%u WR_ACK credits for TID %u with " "nothing pending, state %u wr_avail=%u\n", credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } oh = mtod(p, struct ofld_hdr *); KASSERT(credits >= G_HDR_NDESC(oh->flags), ("%s: partial credits? %d %d", __func__, credits, G_HDR_NDESC(oh->flags))); dequeue_wr(toep); credits -= G_HDR_NDESC(oh->flags); bytes += oh->plen; if (oh->flags & F_HDR_SGL) sglist_free(oh->sgl); m_freem(p); } if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) goto out_free; if (tp->snd_una != snd_una) { tp->snd_una = snd_una; tp->ts_recent_age = tcp_ts_getticks(); if (tp->snd_una == tp->snd_nxt) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } snd = so_sockbuf_snd(so); if (bytes) { SOCKBUF_LOCK(snd); sbdrop_locked(snd, bytes); so_sowwakeup_locked(so); } if (snd->sb_sndptroff < snd->sb_cc) t3_push_frames(so, 0); out_free: inp_wunlock(tp->t_inpcb); m_freem(m); } /* * Handler for TX_DATA_ACK CPL messages. */ static int do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_wr_ack *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); /* XXX bad race */ if (toep) wr_ack(toep, m); return (0); } void t3_init_cpl_io(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack); t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl); t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl); } #endif Index: stable/10/sys/dev/cxgbe/tom/t4_tom.c =================================================================== --- stable/10/sys/dev/cxgbe/tom/t4_tom.c (revision 330302) +++ stable/10/sys/dev/cxgbe/tom/t4_tom.c (revision 330303) @@ -1,1270 +1,1269 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include +#include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" static struct protosw ddp_protosw; static struct pr_usrreqs ddp_usrreqs; static struct protosw ddp6_protosw; static struct pr_usrreqs ddp6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void queue_tid_release(struct adapter *, int); static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static int add_lip(struct adapter *, struct in6_addr *); static int delete_lip(struct adapter *, struct in6_addr *); static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *); static void init_clip_table(struct adapter *, struct tom_data *); static void update_clip(struct adapter *, void *); static void t4_clip_task(void *, int); static void update_clip_table(struct adapter *, struct tom_data *); static void destroy_clip_table(struct adapter *, struct tom_data *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); static int in6_ifaddr_gen; static eventhandler_tag ifaddr_evhandler; static struct timeout_task clip_task; static void mbufq_init(struct mbufq *q, int limit) { q->head = q->tail = NULL; } static void mbufq_drain(struct mbufq *q) { struct mbuf *m; while ((m = q->head) != NULL) { q->head = m->m_nextpkt; m_freem(m); } q->tail = NULL; } #ifdef INVARIANTS static inline int mbufq_len(const struct mbufq *q) { struct mbuf *m; int len; len = 0; for (m = q->head; m != NULL; m = m->m_nextpkt) len++; return (len); } #endif struct toepcb * alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); if (txqid < 0) txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; KASSERT(txqid >= vi->first_ofld_txq && txqid < vi->first_ofld_txq + vi->nofldtxq, ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi, vi->first_ofld_txq, vi->nofldtxq)); if (rxqid < 0) rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; KASSERT(rxqid >= vi->first_ofld_rxq && rxqid < vi->first_ofld_rxq + vi->nofldrxq, ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi, vi->first_ofld_rxq, vi->nofldrxq)); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); toep->td = sc->tom_softc; toep->vi = vi; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; toep->ofld_txq = &sc->sge.ofld_txq[txqid]; toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; return (toep); } void free_toepcb(struct toepcb *toep) { KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (toep->ulp_mode == ULP_MODE_TCPDDP) { if (inp->inp_vflag & INP_IPV6) so->so_proto = &ddp6_protosw; else so->so_proto = &ddp_protosw; } SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_len(&toep->ulp_pduq) == 0); MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0); if (toep->ulp_mode == ULP_MODE_TCPDDP) release_ddp_resources(toep); if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) release_lip(td, toep->ce); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1), 0, 0, toep->ofld_rxq->iq.abs_id); break; default: break; } } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); toep->inp = NULL; toep->flags &= ~TPF_CPL_PENDING; mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } /* * What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, n; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; if (inc->inc_flags & INC_ISIPV6) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } -extern int always_keepalive; - /* * socket so could be a listening socket too. */ uint64_t calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e, int mtu_idx, int rscale, int rx_credits, int ulp_mode) { uint64_t opt0; KASSERT(rx_credits <= M_RCV_BUFSIZ, ("%s: rcv_bufsiz too high", __func__)); opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) | V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - int keepalive = always_keepalive || + int keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0 |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0 |= V_L2T_IDX(e->idx); if (vi != NULL) { opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); } return htobe64(opt0); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->pi->adapter; struct tp_params *tp = &sc->params.tp; uint16_t viid = vi->viid; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0) { uint32_t vf = G_FW_VIID_VIN(viid); uint32_t pf = G_FW_VIID_PFN(viid); uint32_t vld = G_FW_VIID_VIVLD(viid); ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) | V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } void set_tcpddp_ulp_mode(struct toepcb *toep) { toep->ulp_mode = ULP_MODE_TCPDDP; toep->ddp_flags = DDP_OK; toep->ddp_score = DDP_LOW_SCORE; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tabs(struct tid_info *t) { size_t size; unsigned int i; size = t->ntids * sizeof(*t->tid_tab) + t->natids * sizeof(*t->atid_tab) + t->nstids * sizeof(*t->stid_tab); t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids]; t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tabs(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); } static int add_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static int delete_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static struct clip_entry * search_lip(struct tom_data *td, struct in6_addr *lip) { struct clip_entry *ce; mtx_assert(&td->clip_table_lock, MA_OWNED); TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) return (ce); } return (NULL); } struct clip_entry * hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); if (ce == NULL) ce = search_lip(td, lip); if (ce != NULL) ce->refcount++; mtx_unlock(&td->clip_table_lock); return (ce); } void release_lip(struct tom_data *td, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); KASSERT(search_lip(td, &ce->lip) == ce, ("%s: CLIP entry %p p not in CLIP table.", __func__, ce)); KASSERT(ce->refcount > 0, ("%s: CLIP entry %p has refcount 0", __func__, ce)); --ce->refcount; mtx_unlock(&td->clip_table_lock); } static void init_clip_table(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF); TAILQ_INIT(&td->clip_table); td->clip_gen = -1; update_clip_table(sc, td); } static void update_clip(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc")) return; if (uld_active(sc, ULD_TOM)) update_clip_table(sc, sc->tom_softc); end_synchronized_op(sc, LOCK_HELD); } static void t4_clip_task(void *arg, int count) { t4_iterate(update_clip, NULL); } static void update_clip_table(struct adapter *sc, struct tom_data *td) { struct in6_ifaddr *ia; struct in6_addr *lip, tlip; struct clip_head stale; struct clip_entry *ce, *ce_temp; struct vi_info *vi; int rc, gen, i, j; uintptr_t last_vnet; ASSERT_SYNCHRONIZED_OP(sc); IN6_IFADDR_RLOCK(); mtx_lock(&td->clip_table_lock); gen = atomic_load_acq_int(&in6_ifaddr_gen); if (gen == td->clip_gen) goto done; TAILQ_INIT(&stale); TAILQ_CONCAT(&stale, &td->clip_table, link); /* * last_vnet optimizes the common cases where all if_vnet = NULL (no * VIMAGE) or all if_vnet = vnet0. */ last_vnet = (uintptr_t)(-1); for_each_port(sc, i) for_each_vi(sc->port[i], j, vi) { if (last_vnet == (uintptr_t)vi->ifp->if_vnet) continue; /* XXX: races with if_vmove */ CURVNET_SET(vi->ifp->if_vnet); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { lip = &ia->ia_addr.sin6_addr; KASSERT(!IN6_IS_ADDR_MULTICAST(lip), ("%s: mcast address in in6_ifaddr list", __func__)); if (IN6_IS_ADDR_LOOPBACK(lip)) continue; if (IN6_IS_SCOPE_EMBED(lip)) { /* Remove the embedded scope */ tlip = *lip; lip = &tlip; in6_clearscope(lip); } /* * XXX: how to weed out the link local address for the * loopback interface? It's fe80::1 usually (always?). */ /* * If it's in the main list then we already know it's * not stale. */ TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) goto next; } /* * If it's in the stale list we should move it to the * main list. */ TAILQ_FOREACH(ce, &stale, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) { TAILQ_REMOVE(&stale, ce, link); TAILQ_INSERT_TAIL(&td->clip_table, ce, link); goto next; } } /* A new IP6 address; add it to the CLIP table */ ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); memcpy(&ce->lip, lip, sizeof(ce->lip)); ce->refcount = 0; rc = add_lip(sc, lip); if (rc == 0) TAILQ_INSERT_TAIL(&td->clip_table, ce, link); else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not add %s (%d)\n", __func__, ip, rc); free(ce, M_CXGBE); } next: continue; } CURVNET_RESTORE(); last_vnet = (uintptr_t)vi->ifp->if_vnet; } /* * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are * no longer referenced by the driver. */ TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) { if (ce->refcount == 0) { rc = delete_lip(sc, &ce->lip); if (rc == 0) { TAILQ_REMOVE(&stale, ce, link); free(ce, M_CXGBE); } else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not delete %s (%d)\n", __func__, ip, rc); } } } /* The ones that are still referenced need to stay in the CLIP table */ TAILQ_CONCAT(&td->clip_table, &stale, link); td->clip_gen = gen; done: mtx_unlock(&td->clip_table_lock); IN6_IFADDR_RUNLOCK(); } static void destroy_clip_table(struct adapter *sc, struct tom_data *td) { struct clip_entry *ce, *ce_temp; if (mtx_initialized(&td->clip_table_lock)) { mtx_lock(&td->clip_table_lock); TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) { KASSERT(ce->refcount == 0, ("%s: CLIP entry %p still in use (%d)", __func__, ce, ce->refcount)); TAILQ_REMOVE(&td->clip_table, ce, link); delete_lip(sc, &ce->lip); free(ce, M_CXGBE); } mtx_unlock(&td->clip_table_lock); mtx_destroy(&td->clip_table_lock); } } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); destroy_clip_table(sc, td); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid; struct wrqe *wr; struct adapter *sc; mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); sc = td_adapter(td); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; struct sge_ofld_rxq *ofld_rxq; int i, j, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); /* CLIP table for IPv6 offload */ init_clip_table(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { TOEDEV(vi->ifp) = &td->tod; for_each_ofld_rxq(vi, j, ofld_rxq) { ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl; ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2; } } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static void t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp) { atomic_add_rel_int(&in6_ifaddr_gen, 1); taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4); } static int t4_tom_mod_load(void) { int rc; struct protosw *tcp_protosw, *tcp6_protosw; /* CPL handlers */ t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); rc = t4_ddp_mod_load(); if (rc != 0) return (rc); tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw)); bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs)); ddp_usrreqs.pru_soreceive = t4_soreceive_ddp; ddp_protosw.pr_usrreqs = &ddp_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &ddp6_protosw, sizeof(ddp6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &ddp6_usrreqs, sizeof(ddp6_usrreqs)); ddp6_usrreqs.pru_soreceive = t4_soreceive_ddp; ddp6_protosw.pr_usrreqs = &ddp6_usrreqs; TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL); ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event, t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); rc = t4_register_uld(&tom_uld_info); if (rc != 0) t4_tom_mod_unload(); return (rc); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); if (ifaddr_evhandler) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler); taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL); } t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); Index: stable/10/sys/netinet/tcp_timer.c =================================================================== --- stable/10/sys/netinet/tcp_timer.c (revision 330302) +++ stable/10/sys/netinet/tcp_timer.c (revision 330303) @@ -1,947 +1,949 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_persmin; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); -static int always_keepalive = 1; +int tcp_always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, - &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); +__strong_reference(tcp_always_keepalive, always_keepalive); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); #define V_tcp_pmtud_blackhole_activated \ VNET(tcp_pmtud_blackhole_activated) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, CTLFLAG_RD, &VNET_NAME(tcp_pmtud_blackhole_activated), 0, "Path MTU Discovery Black Hole Detection, Activation Count"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); #define V_tcp_pmtud_blackhole_activated_min_mss \ VNET(tcp_pmtud_blackhole_activated_min_mss) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, CTLFLAG_RD, &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, CTLFLAG_RD, &VNET_NAME(tcp_pmtud_blackhole_failed), 0, "Path MTU Discovery Black Hole Detection, Failure Count"); #ifdef INET static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif static int per_cpu_timers = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. */ void tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, ("%s: tp %p delack callout should be running", __func__, tp)); tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); (void) tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, ("%s: tp %p 2msl callout should be running", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. * * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); tp = tcp_close(tp); } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { if (!callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { tp->t_timers->tt_flags &= ~TT_2MSL_RST; } } else tp = tcp_close(tp); } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, ("%s: tp %p keep callout should be running", __func__, tp)); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; - if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + if ((tcp_always_keepalive || + inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. */ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, ("%s: tp %p persist callout should be running", __func__, tp)); /* * Persistance timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, ("%s: tp %p rexmt callout should be running", __func__, tp)); tcp_free_sackholes(tp); /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); headlocked = 1; goto out; } INP_INFO_RUNLOCK(&V_tcbinfo); headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { int optlen; #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. */ /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ optlen = tp->t_maxopd - tp->t_maxseg; tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else if (isipv6) { /* Use the default MSS. */ tp->t_maxopd = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxopd = V_tcp_pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else { /* Use the default MSS. */ tp->t_maxopd = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif tp->t_maxseg = tp->t_maxopd - optlen; /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. * The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift > 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; optlen = tp->t_maxopd - tp->t_maxseg; tp->t_maxopd = tp->t_pmtud_saved_maxopd; tp->t_maxseg = tp->t_maxopd - optlen; V_tcp_pmtud_blackhole_failed++; /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); #endif tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tcp_output(tp); out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif if (tp != NULL) INP_WUNLOCK(inp); if (headlocked) INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = INP_CPU(inp); uint32_t f_reset; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; f_reset = TT_2MSL_RST; break; default: panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { if ((tp->t_timers->tt_flags & timer_type) && callout_stop(t_callout) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } } else { if ((tp->t_timers->tt_flags & timer_type) == 0) { tp->t_timers->tt_flags |= (timer_type | f_reset); callout_reset_on(t_callout, delta, f_callout, tp, cpu); } else { /* Reset already running callout on the same CPU. */ if (!callout_reset(t_callout, delta, f_callout, tp)) { /* * Callout not cancelled, consider it as not * properly restarted. */ tp->t_timers->tt_flags &= ~f_reset; } } } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; timeout_t *f_callout; uint32_t f_reset; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack_discard; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt_discard; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist_discard; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep_discard; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl_discard; f_reset = TT_2MSL_RST; break; default: panic("tp %p bad timer_type %#x", tp, timer_type); } if (tp->t_timers->tt_flags & timer_type) { if (callout_stop(t_callout) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } else { /* * Can't stop the callout, defer tcpcb actual deletion * to the last tcp timer discard callout. * The TT_STOPPED flag will ensure that no tcp timer * callouts can be restarted on our behalf, and * past this point currently running callouts waiting * on inp lock will return right away after the * classical check for callout reset/stop events: * callout_pending() || !callout_active() */ callout_reset(t_callout, 1, f_callout, tp); } } } #define ticks_to_msecs(t) (1000*(t) / hz) void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) { sbintime_t now; bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; now = getsbinuptime(); if (callout_active(&timer->tt_delack)) xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } Index: stable/10/sys/netinet/tcp_timer.h =================================================================== --- stable/10/sys/netinet/tcp_timer.h (revision 330302) +++ stable/10/sys/netinet/tcp_timer.h (revision 330303) @@ -1,213 +1,214 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments * have been sent for which ACKs are expected but not yet * received. If an ACK is received which advances tp->snd_una, * then the retransmit timer is cleared (if there are no more * outstanding segments) or reset to the base value (if there * are more ACKs expected). Whenever the retransmit timer goes off, * we retransmit one unacknowledged segment, and do a backoff * on the retransmit timer. * * The TCPT_PERSIST timer is used to keep window size information * flowing even if the window goes shut. If all previous transmissions * have been acknowledged (so that there are no retransmissions in progress), * and the window is too small to bother sending anything, then we start * the TCPT_PERSIST timer. When it expires, if the window is nonzero, * we go to transmit state. Otherwise, at intervals send a single byte * into the peer's window to force him to update our window information. * We do this at most as often as TCPT_PERSMIN time intervals, * but no more frequently than the current estimate of round-trip * packet time. The TCPT_PERSIST timer is cleared whenever we receive * a window update from the peer. * * The TCPT_KEEP timer is used to keep connections alive. If an * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, * but not yet established, then we drop the connection. Once the connection * is established, if the connection is idle for TCPTV_KEEP_IDLE time * (and keepalives have been enabled on the socket), we begin to probe * the connection. We force the peer to send us a segment by sending: * * This segment is (deliberately) outside the window, and should elicit * an ack segment in response from the peer. If, despite the TCPT_KEEP * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE * amount of time probing, then we drop the connection. */ /* * Time constants. */ #define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ #define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ #define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */ #define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ #define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ #define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ #define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with * the expected worst-case processing variances by the kernels * representing the end points. Such variances do not always show * up in the srtt because the timestamp is often calculated at * the interface rather then at the TCP layer. This value is * typically 50ms. However, it is also possible that delayed * acks (typically 100ms) could create issues so we set the slop * to 200ms to try to cover it. Note that, properly speaking, * delayed-acks should not create a major issue for interactive * environments which 'P'ush the last segment, at least as * long as implementations do the required 'at least one ack * for every two packets' for the non-interactive streaming case. * (maybe the RTO calculation should use 2*RTT instead of RTT * to handle the ack-every-other-packet case). * * The prior minimum of 1*hz (1 second) badly breaks throughput on any * networks faster then a modem that has minor (e.g. 1%) packet loss. */ #define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ #define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ #define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ #define TCP_LINGERTIME 120 /* linger at most 2 minutes */ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ #define TCPTV_DELACK ( hz/10 ) /* 100ms timeout */ #ifdef TCPTIMERS static const char *tcptimers[] = { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" }; #endif /* * Force a time value to be in a certain range. */ #define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ (tv) = (value) + tcp_rexmit_slop; \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } while(0) #ifdef _KERNEL struct xtcp_timer; struct tcp_timer { struct callout tt_rexmt; /* retransmit timer */ struct callout tt_persist; /* retransmit persistence */ struct callout tt_keep; /* keepalive */ struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ struct callout tt_delack; /* delayed ACK timer */ uint32_t tt_flags; /* Timers flags */ uint32_t tt_spare; /* TDB */ }; /* * Flags for the tt_flags field. */ #define TT_DELACK 0x0001 #define TT_REXMT 0x0002 #define TT_PERSIST 0x0004 #define TT_KEEP 0x0008 #define TT_2MSL 0x0010 #define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) #define TT_DELACK_RST 0x0100 #define TT_REXMT_RST 0x0200 #define TT_PERSIST_RST 0x0400 #define TT_KEEP_RST 0x0800 #define TT_2MSL_RST 0x1000 #define TT_STOPPED 0x00010000 #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) extern int tcp_persmin; /* minimum persist interval */ extern int tcp_persmax; /* maximum persist interval */ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ extern int tcp_maxpersistidle; extern int tcp_rexmit_min; extern int tcp_rexmit_slop; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; extern int tcp_syn_backoff[]; +extern int tcp_always_keepalive; extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); struct tcptw * tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); void tcp_timer_delack(void *xtp); void tcp_timer_2msl_discard(void *xtp); void tcp_timer_keep_discard(void *xtp); void tcp_timer_persist_discard(void *xtp); void tcp_timer_rexmt_discard(void *xtp); void tcp_timer_delack_discard(void *xtp); void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer); #endif /* _KERNEL */ #endif /* !_NETINET_TCP_TIMER_H_ */ Index: stable/10 =================================================================== --- stable/10 (revision 330302) +++ stable/10 (revision 330303) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r328608 Index: stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c =================================================================== --- stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 330302) +++ stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 330303) @@ -1,1811 +1,1810 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include "cxgb_include.h" #include "ulp/tom/cxgb_l2t.h" #include "ulp/tom/cxgb_tom.h" #include "ulp/tom/cxgb_toepcb.h" VNET_DECLARE(int, tcp_do_autosndbuf); #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) VNET_DECLARE(int, tcp_autosndbuf_inc); #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) VNET_DECLARE(int, tcp_autosndbuf_max); #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) VNET_DECLARE(int, tcp_do_autorcvbuf); #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) VNET_DECLARE(int, tcp_autorcvbuf_inc); #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) VNET_DECLARE(int, tcp_autorcvbuf_max); #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) -extern int always_keepalive; /* * For ULP connections HW may add headers, e.g., for digests, that aren't part * of the messages sent by the host but that are part of the TCP payload and * therefore consume TCP sequence space. Tx connection parameters that * operate in TCP sequence space are affected by the HW additions and need to * compensate for them to accurately track TCP sequence numbers. This array * contains the compensating extra lengths for ULP packets. It is indexed by * a packet's ULP submode. */ const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. */ #define MAX_RCV_WND ((1U << 27) - 1) /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) static void t3_release_offload_resources(struct toepcb *); static void send_reset(struct toepcb *toep); /* * Called after the last CPL for the toepcb has been received. * * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the * time this function exits. */ static int toepcb_release(struct toepcb *toep) { struct inpcb *inp = toep->tp_inp; struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); int rc; INP_WLOCK_ASSERT(inp); KASSERT(!(toep->tp_flags & TP_CPL_DONE), ("%s: double release?", __func__)); CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid); toep->tp_flags |= TP_CPL_DONE; toep->tp_inp = NULL; mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); if (!(toep->tp_flags & TP_ATTACHED)) t3_release_offload_resources(toep); rc = in_pcbrele_wlocked(inp); if (!rc) INP_WUNLOCK(inp); return (rc); } /* * One sided detach. The tcpcb is going away and we need to unhook the toepcb * hanging off it. If the TOE driver is also done with the toepcb we'll release * all offload resources. */ static void toepcb_detach(struct inpcb *inp) { struct toepcb *toep; struct tcpcb *tp; KASSERT(inp, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); toep = tp->t_toe; KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__)); CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__, tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid, toep, inp, tp); tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->tp_flags &= ~TP_ATTACHED; if (toep->tp_flags & TP_CPL_DONE) t3_release_offload_resources(toep); } void t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { toepcb_detach(tp->t_inpcb); } static int alloc_atid(struct tid_info *t, void *ctx) { int atid = -1; mtx_lock(&t->atid_lock); if (t->afree) { union active_open_entry *p = t->afree; atid = (p - t->atid_tab) + t->atid_base; t->afree = p->next; p->ctx = ctx; t->atids_in_use++; } mtx_unlock(&t->atid_lock); return (atid); } static void free_atid(struct tid_info *t, int atid) { union active_open_entry *p = atid2entry(t, atid); mtx_lock(&t->atid_lock); p->next = t->afree; t->afree = p; t->atids_in_use--; mtx_unlock(&t->atid_lock); } void insert_tid(struct tom_data *td, void *ctx, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, 1); } void update_tid(struct tom_data *td, void *ctx, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = ctx; } void remove_tid(struct tom_data *td, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = NULL; atomic_add_int(&t->tids_in_use, -1); } /* use ctx as a next pointer in the tid release list */ void queue_tid_release(struct toedev *tod, unsigned int tid) { struct tom_data *td = t3_tomdata(tod); void **p = &td->tid_maps.tid_tab[tid]; struct adapter *sc = tod->tod_softc; mtx_lock(&td->tid_release_lock); *p = td->tid_release_list; td->tid_release_list = p; if (!*p) taskqueue_enqueue(sc->tq, &td->tid_release_task); mtx_unlock(&td->tid_release_lock); } /* * Populate a TID_RELEASE WR. */ static inline void mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid) { cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); } void release_tid(struct toedev *tod, unsigned int tid, int qset) { struct tom_data *td = t3_tomdata(tod); struct adapter *sc = tod->tod_softc; struct mbuf *m; struct cpl_tid_release *cpl; #ifdef INVARIANTS struct tid_info *t = &td->tid_maps; #endif KASSERT(tid < t->ntids, ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids)); m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); if (m) { mk_tid_release(cpl, tid); t3_offload_tx(sc, m); remove_tid(td, tid); } else queue_tid_release(tod, tid); } void t3_process_tid_release_list(void *data, int pending) { struct mbuf *m; struct tom_data *td = data; struct adapter *sc = td->tod.tod_softc; mtx_lock(&td->tid_release_lock); while (td->tid_release_list) { void **p = td->tid_release_list; unsigned int tid = p - td->tid_maps.tid_tab; struct cpl_tid_release *cpl; td->tid_release_list = (void **)*p; m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */ if (m == NULL) break; /* XXX: who reschedules the release task? */ mtx_unlock(&td->tid_release_lock); mk_tid_release(cpl, tid); t3_offload_tx(sc, m); remove_tid(td, tid); mtx_lock(&td->tid_release_lock); } mtx_unlock(&td->tid_release_lock); } static void close_conn(struct adapter *sc, struct toepcb *toep) { struct mbuf *m; struct cpl_close_con_req *req; if (toep->tp_flags & TP_FIN_SENT) return; m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); if (m == NULL) CXGB_UNIMPLEMENTED(); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid)); req->rsvd = 0; toep->tp_flags |= TP_FIN_SENT; t3_offload_tx(sc, m); } static inline void make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len, struct mbuf *tail) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct sockbuf *snd; inp_lock_assert(tp->t_inpcb); snd = so_sockbuf_snd(so); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); /* len includes the length of any HW ULP additions */ req->len = htonl(len); req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); /* V_TX_ULP_SUBMODE sets both the mode and submode */ req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) | V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1))); req->sndseq = htonl(tp->snd_nxt); if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { struct adapter *sc = toep->tp_tod->tod_softc; int cpu_idx = sc->rrss_map[toep->tp_qset]; req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | V_TX_CPU_IDX(cpu_idx)); /* Sendbuffer is in units of 32KB. */ if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15)); else req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); toep->tp_flags |= TP_DATASENT; } } /* * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc. * TOM_XXX_MOVE to some common header file. */ /* * IMM_LEN: # of bytes that can be tx'd as immediate data. There are 16 flits * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more * for the second gen bit flit. This leaves us with 12 flits. * * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs. * The first desc has a tx_data_wr (which includes the WR header), the rest have * the WR header only. All descs have the second gen bit flit. * * sgllen_to_descs: # of tx descs used up by an sgl of given length. The first * desc has a tx_data_wr (which includes the WR header), the rest have the WR * header only. All descs have the second gen bit flit. * * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits. * */ #define IMM_LEN 96 static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35}; static int sgllen_to_descs[TX_MAX_SEGS] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, /* 0 - 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, /* 10 - 19 */ 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, /* 20 - 29 */ 4, 4, 4, 4, 4, 4 /* 30 - 35 */ }; #if 0 static int flits_to_sgllen[TX_DESC_FLITS + 1] = { 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10 }; #endif #if SGE_NUM_GENBITS != 2 #error "SGE_NUM_GENBITS really must be 2" #endif int t3_push_frames(struct socket *so, int req_completion) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m0, *sndptr, *m; struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; int bytes, ndesc, total_bytes = 0, mlen; struct sockbuf *snd; struct sglist *sgl; struct ofld_hdr *oh; caddr_t dst; struct tx_data_wr *wr; inp_lock_assert(tp->t_inpcb); snd = so_sockbuf_snd(so); SOCKBUF_LOCK(snd); /* * Autosize the send buffer. */ if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) { if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) && sbused(snd) < VNET(tcp_autosndbuf_max)) { if (!sbreserve_locked(snd, min(snd->sb_hiwat + VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)), so, curthread)) snd->sb_flags &= ~SB_AUTOSIZE; } } if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr) sndptr = toep->tp_m_last->m_next; else sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; /* Nothing to send or no WRs available for sending data */ if (toep->tp_wr_avail == 0 || sndptr == NULL) goto out; /* Something to send and at least 1 WR available */ while (toep->tp_wr_avail && sndptr != NULL) { m0 = m_gethdr(M_NOWAIT, MT_DATA); if (m0 == NULL) break; oh = mtod(m0, struct ofld_hdr *); wr = (void *)(oh + 1); dst = (void *)(wr + 1); m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr); oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF | V_HDR_QSET(toep->tp_qset); /* * Try to construct an immediate data WR if possible. Stuff as * much data into it as possible, one whole mbuf at a time. */ mlen = sndptr->m_len; ndesc = bytes = 0; while (mlen <= IMM_LEN - bytes) { bcopy(sndptr->m_data, dst, mlen); bytes += mlen; dst += mlen; if (!(sndptr = sndptr->m_next)) break; mlen = sndptr->m_len; } if (bytes) { /* Was able to fit 'bytes' bytes in an immediate WR */ ndesc = 1; make_tx_data_wr(so, wr, bytes, sndptr); m0->m_len += bytes; m0->m_pkthdr.len = m0->m_len; } else { int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC); /* Need to make an SGL */ sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT); if (sgl == NULL) break; for (m = sndptr; m != NULL; m = m->m_next) { if ((mlen = m->m_len) > 0) { if (sglist_append(sgl, m->m_data, mlen)) break; } bytes += mlen; } sndptr = m; if (bytes == 0) { sglist_free(sgl); break; } ndesc = sgllen_to_descs[sgl->sg_nseg]; oh->flags |= F_HDR_SGL; oh->sgl = sgl; make_tx_data_wr(so, wr, bytes, sndptr); } oh->flags |= V_HDR_NDESC(ndesc); oh->plen = bytes; snd->sb_sndptr = sndptr; snd->sb_sndptroff += bytes; if (sndptr == NULL) { snd->sb_sndptr = snd->sb_mbtail; snd->sb_sndptroff -= snd->sb_mbtail->m_len; toep->tp_m_last = snd->sb_mbtail; } else toep->tp_m_last = NULL; total_bytes += bytes; toep->tp_wr_avail -= ndesc; toep->tp_wr_unacked += ndesc; if ((req_completion && toep->tp_wr_unacked == ndesc) || toep->tp_wr_unacked >= toep->tp_wr_max / 2) { wr->wr.wrh_hi |= htonl(F_WR_COMPL); toep->tp_wr_unacked = 0; } enqueue_wr(toep, m0); l2t_send(sc, m0, toep->tp_l2t); } out: SOCKBUF_UNLOCK(snd); if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN)) close_conn(sc, toep); return (total_bytes); } static int send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct mbuf *m; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req); if (m == NULL) return (0); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wrh_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); t3_offload_tx(sc, m); return (credits); } void t3_rcvd(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; struct sockbuf *so_rcv = &so->so_rcv; struct toepcb *toep = tp->t_toe; int must_send; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(so_rcv); KASSERT(toep->tp_enqueued >= sbused(so_rcv), ("%s: sbused(so_rcv) > enqueued", __func__)); toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv); toep->tp_enqueued = sbused(so_rcv); SOCKBUF_UNLOCK(so_rcv); must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd; if (must_send || toep->tp_rx_credits >= 15 * 1024) { int credits; credits = send_rx_credits(sc, toep, toep->tp_rx_credits); toep->tp_rx_credits -= credits; tp->rcv_wnd += credits; tp->rcv_adv += credits; } } static int do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rx_urg_notify *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp); m_freem(m); return (0); } int t3_send_fin(struct toedev *tod, struct tcpcb *tp) { struct toepcb *toep = tp->t_toe; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp_inpcbtosocket(inp); #if defined(KTR) unsigned int tid = toep->tp_tid; #endif INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep, toep->tp_flags); toep->tp_flags |= TP_SEND_FIN; t3_push_frames(so, 1); return (0); } int t3_tod_output(struct toedev *tod, struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; t3_push_frames(so, 1); return (0); } /* What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i = 0, mss; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40) ++i; return (i); } static inline void purge_wr_queue(struct toepcb *toep) { struct mbuf *m; struct ofld_hdr *oh; while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) { oh = mtod(m, struct ofld_hdr *); if (oh->flags & F_HDR_SGL) sglist_free(oh->sgl); m_freem(m); } } /* * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T * entry, etc.) */ static void t3_release_offload_resources(struct toepcb *toep) { struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); /* * The TOM explicitly detaches its toepcb from the system's inp before * it releases the offload resources. */ if (toep->tp_inp) { panic("%s: inp %p still attached to toepcb %p", __func__, toep->tp_inp, toep); } if (toep->tp_wr_avail != toep->tp_wr_max) purge_wr_queue(toep); if (toep->tp_l2t) { l2t_release(td->l2t, toep->tp_l2t); toep->tp_l2t = NULL; } if (toep->tp_tid >= 0) release_tid(tod, toep->tp_tid, toep->tp_qset); toepcb_free(toep); } /* * Determine the receive window size for a socket. */ unsigned long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); INP_WLOCK_ASSERT(inp); /* Update socket */ SOCKBUF_LOCK(&so->so_snd); so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(&so->so_rcv); /* Update TCP PCB */ tp->tod = toep->tp_tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->tp_inp = inp; toep->tp_flags |= TP_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); INP_WLOCK_ASSERT(inp); so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE; so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE; tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->tp_inp = NULL; toep->tp_flags &= ~TP_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* * Socket could be a listening socket, and we may not have a toepcb at all at * this time. */ uint32_t calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e) { uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - int keepalive = always_keepalive || + int keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0h |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx); return (htobe32(opt0h)); } uint32_t calc_opt0l(struct socket *so, int rcv_bufsize) { uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize); KASSERT(rcv_bufsize <= M_RCV_BUFSIZ, ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize)); if (so != NULL) /* optional because no one cares about IP TOS */ opt0l |= V_TOS(INP_TOS(sotoinpcb(so))); return (htobe32(opt0l)); } /* * Convert an ACT_OPEN_RPL status to an errno. */ static int act_open_rpl_status_to_errno(int status) { switch (status) { case CPL_ERR_CONN_RESET: return (ECONNREFUSED); case CPL_ERR_ARP_MISS: return (EHOSTUNREACH); case CPL_ERR_CONN_TIMEDOUT: return (ETIMEDOUT); case CPL_ERR_TCAM_FULL: return (EAGAIN); case CPL_ERR_CONN_EXIST: log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); return (EAGAIN); default: return (EIO); } } /* * Return whether a failed active open has allocated a TID */ static inline int act_open_has_tid(int status) { return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && status != CPL_ERR_ARP_MISS; } /* * Active open failed. */ static int do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; struct cpl_act_open_rpl *rpl = mtod(m, void *); unsigned int atid = G_TID(ntohl(rpl->atid)); struct toepcb *toep = lookup_atid(&td->tid_maps, atid); struct inpcb *inp = toep->tp_inp; int s = rpl->status, rc; CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s); free_atid(&td->tid_maps, atid); toep->tp_tid = -1; if (act_open_has_tid(s)) queue_tid_release(tod, GET_TID(rpl)); rc = act_open_rpl_status_to_errno(s); if (rc != EAGAIN) INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, rc); toepcb_release(toep); /* unlocks inp */ if (rc != EAGAIN) INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* * Send an active open request. * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (this has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t3_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct mbuf *m = NULL; struct l2t_entry *e = NULL; struct tom_data *td = t3_tomdata(tod); struct adapter *sc = tod->tod_softc; struct cpl_act_open_req *cpl; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep; int atid = -1, mtu_idx, rscale, cpu_idx, qset; struct sockaddr *gw; struct ifnet *ifp = rt->rt_ifp; struct port_info *pi = ifp->if_softc; /* XXX wrong for VLAN etc. */ INP_WLOCK_ASSERT(inp); toep = toepcb_alloc(tod); if (toep == NULL) goto failed; atid = alloc_atid(&td->tid_maps, toep); if (atid < 0) goto failed; qset = pi->first_qset + (arc4random() % pi->nqsets); m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); if (m == NULL) goto failed; gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam; e = t3_l2t_get(pi, ifp, gw); if (e == NULL) goto failed; toep->tp_l2t = e; toep->tp_tid = atid; /* used to double check response */ toep->tp_qset = qset; SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); offload_socket(so, toep); /* * The kernel sets request_r_scale based on sb_max whereas we need to * take hardware's MAX_RCV_WND into account too. This is normally a * no-op as MAX_RCV_WND is much larger than the default sb_max. */ if (tp->t_flags & TF_REQ_SCALE) rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); cpu_idx = sc->rrss_map[qset]; cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD)); cpl->wr.wrh_lo = 0; OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e); cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits); cpl->params = 0; cpl->opt2 = calc_opt2(cpu_idx); CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__, toep->tp_tid, tcpstates[tp->t_state], toep, inp); if (l2t_send(sc, m, e) == 0) return (0); undo_offload_socket(so); failed: CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p", __func__, atid, toep, e, m); if (atid >= 0) free_atid(&td->tid_maps, atid); if (e) l2t_release(td->l2t, e); if (toep) toepcb_free(toep); m_freem(m); return (ENOMEM); } /* * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do not * send multiple ABORT_REQs for the same connection and also that we do not try * to send a message after the connection has closed. */ static void send_reset(struct toepcb *toep) { struct cpl_abort_req *req; unsigned int tid = toep->tp_tid; struct inpcb *inp = toep->tp_inp; struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; struct mbuf *m; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep, toep->tp_flags); if (toep->tp_flags & TP_ABORT_SHUTDOWN) return; toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN); /* Purge the send queue */ sbflush(so_sockbuf_snd(so)); purge_wr_queue(toep); m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); if (m == NULL) CXGB_UNIMPLEMENTED(); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); req->rsvd0 = htonl(tp->snd_nxt); req->rsvd1 = !(toep->tp_flags & TP_DATASENT); req->cmd = CPL_ABORT_SEND_RST; if (tp->t_state == TCPS_SYN_SENT) (void )mbufq_enqueue(&toep->out_of_order_queue, m); /* defer */ else l2t_send(sc, m, toep->tp_l2t); } int t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp) { send_reset(tp->t_toe); return (0); } /* * Handler for RX_DATA CPL messages. */ static int do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rx_data *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; struct sockbuf *so_rcv; /* Advance over CPL */ m_adj(m, sizeof(*hdr)); /* XXX: revisit. This comes from the T4 TOM */ if (__predict_false(inp == NULL)) { /* * do_pass_establish failed and must be attempting to abort the * connection. Meanwhile, the T4 has sent us data for such a * connection. */ #ifdef notyet KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), ("%s: inp NULL and tid isn't being aborted", __func__)); #endif m_freem(m); return (0); } INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, m->m_pkthdr.len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) toep->tp_delack_mode = hdr->dack_mode; tp = intotcpcb(inp); #ifdef INVARIANTS if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) { log(LOG_ERR, "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt); } #endif tp->rcv_nxt += m->m_pkthdr.len; KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, ("%s: negative window size", __func__)); tp->rcv_wnd -= m->m_pkthdr.len; tp->t_rcvtime = ticks; so = inp->inp_socket; so_rcv = &so->so_rcv; SOCKBUF_LOCK(so_rcv); if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)", __func__, tid, m->m_pkthdr.len); SOCKBUF_UNLOCK(so_rcv); INP_WUNLOCK(inp); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* receive buffer autosize */ if (so_rcv->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) { unsigned int hiwat = so_rcv->sb_hiwat; unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so_rcv, newsize, so, NULL)) so_rcv->sb_flags &= ~SB_AUTOSIZE; else toep->tp_rx_credits += newsize - hiwat; } toep->tp_enqueued += m->m_pkthdr.len; sbappendstream_locked(so_rcv, m, 0); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(so_rcv); INP_WUNLOCK(inp); return (0); } /* * Handler for PEER_CLOSE CPL messages. */ static int do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_peer_close *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp); if (toep->tp_flags & TP_ABORT_RPL_PENDING) goto done; so = inp_inpcbtosocket(inp); socantrcvmore(so); tp->rcv_nxt++; switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ m_freem(m); return (0); default: log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", __func__, toep->tp_tid, tp->t_state); } done: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* * Handler for CLOSE_CON_RPL CPL messages. peer ACK to our FIN received. */ static int do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_close_con_rpl *rpl = mtod(m, void *); unsigned int tid = GET_TID(rpl); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags); if ((toep->tp_flags & TP_ABORT_RPL_PENDING)) goto done; so = inp_inpcbtosocket(inp); tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ switch (tp->t_state) { case TCPS_CLOSING: tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ m_freem(m); return (0); case TCPS_LAST_ACK: if (tcp_close(tp)) INP_WUNLOCK(inp); goto release; case TCPS_FIN_WAIT_1: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) soisdisconnected(so); tp->t_state = TCPS_FIN_WAIT_2; break; default: log(LOG_ERR, "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", __func__, toep->tp_tid, tp->t_state); } done: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } static int do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct cpl_smt_write_rpl *rpl = mtod(m, void *); if (rpl->status != CPL_ERR_NONE) { log(LOG_ERR, "Unexpected SMT_WRITE_RPL status %u for entry %u\n", rpl->status, GET_TID(rpl)); } m_freem(m); return (0); } static int do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct cpl_set_tcb_rpl *rpl = mtod(m, void *); if (rpl->status != CPL_ERR_NONE) { log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n", rpl->status, GET_TID(rpl)); } m_freem(m); return (0); } /* * Handle an ABORT_RPL_RSS CPL message. */ static int do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); unsigned int tid = GET_TID(rpl); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; /* * Ignore replies to post-close aborts indicating that the abort was * requested too late. These connections are terminated when we get * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss * arrives the TID is either no longer used or it has been recycled. */ if (rpl->status == CPL_ERR_ABORT_FAILED) { m_freem(m); return (0); } if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) return (do_abort_rpl_synqe(qs, r, m)); CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, rpl->status); inp = toep->tp_inp; INP_WLOCK(inp); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { toep->tp_flags |= TP_ABORT_RPL_RCVD; INP_WUNLOCK(inp); } else { toep->tp_flags &= ~TP_ABORT_RPL_RCVD; toep->tp_flags &= TP_ABORT_RPL_PENDING; toepcb_release(toep); /* no more CPLs expected */ } } m_freem(m); return (0); } /* * Convert the status code of an ABORT_REQ into a FreeBSD error code. */ static int abort_status_to_errno(struct tcpcb *tp, int abort_reason) { switch (abort_reason) { case CPL_ERR_BAD_SYN: case CPL_ERR_CONN_RESET: return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: return (ETIMEDOUT); default: return (EIO); } } /* * Returns whether an ABORT_REQ_RSS message is a negative advice. */ static inline int is_neg_adv_abort(unsigned int status) { return status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE; } void send_abort_rpl(struct toedev *tod, int tid, int qset) { struct mbuf *reply; struct cpl_abort_rpl *rpl; struct adapter *sc = tod->tod_softc; reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); if (!reply) CXGB_UNIMPLEMENTED(); rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); rpl->cmd = CPL_ABORT_NO_RST; t3_offload_tx(sc, reply); } /* * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we * ignore this request except that we need to reply to it. */ static int do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; const struct cpl_abort_req_rss *req = mtod(m, void *); unsigned int tid = GET_TID(req); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; struct tcpcb *tp; struct socket *so; int qset = toep->tp_qset; if (is_neg_adv_abort(req->status)) { CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)", __func__, req->status, tid, toep->tp_flags); m_freem(m); return (0); } if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) return (do_abort_req_synqe(qs, r, m)); inp = toep->tp_inp; INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); so = inp->inp_socket; CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d", __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags, req->status); if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) { toep->tp_flags |= TP_ABORT_REQ_RCVD; toep->tp_flags |= TP_ABORT_SHUTDOWN; INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } toep->tp_flags &= ~TP_ABORT_REQ_RCVD; /* * If we'd sent a reset on this toep, we'll ignore this and clean up in * the T3's reply to our reset instead. */ if (toep->tp_flags & TP_ABORT_RPL_PENDING) { toep->tp_flags |= TP_ABORT_RPL_SENT; INP_WUNLOCK(inp); } else { so_error_set(so, abort_status_to_errno(tp, req->status)); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ toepcb_release(toep); /* no more CPLs expected */ } INP_INFO_RUNLOCK(&V_tcbinfo); send_abort_rpl(tod, tid, qset); m_freem(m); return (0); } static void assign_rxopt(struct tcpcb *tp, uint16_t tcpopt) { struct toepcb *toep = tp->t_toe; struct adapter *sc = toep->tp_tod->tod_softc; tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; if (G_TCPOPT_TSTAMP(tcpopt)) { tp->t_flags |= TF_RCVD_TSTMP; tp->t_flags |= TF_REQ_TSTMP; /* forcibly set */ tp->ts_recent = 0; /* XXX */ tp->ts_recent_age = tcp_ts_getticks(); } if (G_TCPOPT_SACK(tcpopt)) tp->t_flags |= TF_SACK_PERMIT; else tp->t_flags &= ~TF_SACK_PERMIT; if (G_TCPOPT_WSCALE_OK(tcpopt)) tp->t_flags |= TF_RCVD_SCALE; if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt); } } /* * The ISS and IRS are from after the exchange of SYNs and are off by 1. */ void make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs, uint16_t cpl_tcpopt) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; long bufsize; uint32_t iss = be32toh(cpl_iss) - 1; /* true ISS */ uint32_t irs = be32toh(cpl_irs) - 1; /* true IRS */ uint16_t tcpopt = be16toh(cpl_tcpopt); INP_WLOCK_ASSERT(inp); tp->t_state = TCPS_ESTABLISHED; tp->t_starttime = ticks; TCPSTAT_INC(tcps_connects); CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state], toep->tp_tid, toep, inp); tp->irs = irs; tcp_rcvseqinit(tp); tp->rcv_wnd = toep->tp_rx_credits << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; /* * If we were unable to send all rx credits via opt0, save the remainder * in rx_credits so that they can be handed over with the next credit * update. */ SOCKBUF_LOCK(&so->so_rcv); bufsize = select_rcv_wnd(so); SOCKBUF_UNLOCK(&so->so_rcv); toep->tp_rx_credits = bufsize - tp->rcv_wnd; tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; tp->snd_nxt = iss + 1; tp->snd_max = iss + 1; assign_rxopt(tp, tcpopt); soisconnected(so); } /* * Fill in the right TID for CPL messages waiting in the out-of-order queue * and send them to the TOE. */ static void fixup_and_send_ofo(struct toepcb *toep) { struct mbuf *m; struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; unsigned int tid = toep->tp_tid; inp_lock_assert(toep->tp_inp); while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { struct ofld_hdr *oh = mtod(m, void *); /* * A variety of messages can be waiting but the fields we'll * be touching are common to all so any message type will do. */ struct cpl_close_con_req *p = (void *)(oh + 1); p->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); t3_offload_tx(sc, m); } } /* * Process a CPL_ACT_ESTABLISH message. */ static int do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_act_establish *req = mtod(m, void *); unsigned int tid = GET_TID(req); unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); struct toepcb *toep = lookup_atid(&td->tid_maps, atid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(&td->tid_maps, atid); INP_WLOCK(inp); tp = intotcpcb(inp); KASSERT(toep->tp_qset == qs->idx, ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx)); KASSERT(toep->tp_tid == atid, ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid)); toep->tp_tid = tid; insert_tid(td, toep, tid); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_reset(toep); goto done; } KASSERT(tp->t_state == TCPS_SYN_SENT, ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state)); so = inp->inp_socket; make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt); /* * Now that we finally have a TID send any CPL messages that we had to * defer for lack of a TID. */ if (mbufq_len(&toep->out_of_order_queue)) fixup_and_send_ofo(toep); done: INP_WUNLOCK(inp); m_freem(m); return (0); } /* * Process an acknowledgment of WR completion. Advance snd_una and send the * next batch of work requests from the write queue. */ static void wr_ack(struct toepcb *toep, struct mbuf *m) { struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct cpl_wr_ack *hdr = mtod(m, void *); struct socket *so; unsigned int credits = ntohs(hdr->credits); u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; struct sockbuf *snd; struct mbuf *p; struct ofld_hdr *oh; inp_wlock(inp); tp = intotcpcb(inp); so = inp->inp_socket; toep->tp_wr_avail += credits; if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; while (credits) { p = peek_wr(toep); if (__predict_false(!p)) { CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, " "tid %u, state %u, wr_avail %u", __func__, credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); log(LOG_ERR, "%u WR_ACK credits for TID %u with " "nothing pending, state %u wr_avail=%u\n", credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } oh = mtod(p, struct ofld_hdr *); KASSERT(credits >= G_HDR_NDESC(oh->flags), ("%s: partial credits? %d %d", __func__, credits, G_HDR_NDESC(oh->flags))); dequeue_wr(toep); credits -= G_HDR_NDESC(oh->flags); bytes += oh->plen; if (oh->flags & F_HDR_SGL) sglist_free(oh->sgl); m_freem(p); } if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) goto out_free; if (tp->snd_una != snd_una) { tp->snd_una = snd_una; tp->ts_recent_age = tcp_ts_getticks(); if (tp->snd_una == tp->snd_nxt) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } snd = so_sockbuf_snd(so); if (bytes) { SOCKBUF_LOCK(snd); sbdrop_locked(snd, bytes); so_sowwakeup_locked(so); } if (snd->sb_sndptroff < sbused(snd)) t3_push_frames(so, 0); out_free: inp_wunlock(tp->t_inpcb); m_freem(m); } /* * Handler for TX_DATA_ACK CPL messages. */ static int do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_wr_ack *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); /* XXX bad race */ if (toep) wr_ack(toep, m); return (0); } void t3_init_cpl_io(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack); t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl); t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl); } #endif Index: stable/11/sys/dev/cxgbe/tom/t4_tom.c =================================================================== --- stable/11/sys/dev/cxgbe/tom/t4_tom.c (revision 330302) +++ stable/11/sys/dev/cxgbe/tom/t4_tom.c (revision 330303) @@ -1,1272 +1,1271 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include +#include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" static struct protosw toe_protosw; static struct pr_usrreqs toe_usrreqs; static struct protosw toe6_protosw; static struct pr_usrreqs toe6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void queue_tid_release(struct adapter *, int); static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static int add_lip(struct adapter *, struct in6_addr *); static int delete_lip(struct adapter *, struct in6_addr *); static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *); static void init_clip_table(struct adapter *, struct tom_data *); static void update_clip(struct adapter *, void *); static void t4_clip_task(void *, int); static void update_clip_table(struct adapter *, struct tom_data *); static void destroy_clip_table(struct adapter *, struct tom_data *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); static int in6_ifaddr_gen; static eventhandler_tag ifaddr_evhandler; static struct timeout_task clip_task; struct toepcb * alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); if (txqid < 0) txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; KASSERT(txqid >= vi->first_ofld_txq && txqid < vi->first_ofld_txq + vi->nofldtxq, ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi, vi->first_ofld_txq, vi->nofldtxq)); if (rxqid < 0) rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; KASSERT(rxqid >= vi->first_ofld_rxq && rxqid < vi->first_ofld_rxq + vi->nofldrxq, ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi, vi->first_ofld_rxq, vi->nofldrxq)); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; toep->ofld_txq = &sc->sge.ofld_txq[txqid]; toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); ddp_init_toep(toep); return (toep); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); ddp_uninit_toep(toep); free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. */ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_len(&toep->ulp_pduq) == 0); MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0); #ifdef INVARIANTS ddp_assert_empty(toep); #endif if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) release_lip(td, toep->ce); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1), 0, 0, toep->ofld_rxq->iq.abs_id); break; default: break; } } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (toep->ulp_mode == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; toep->flags &= ~TPF_CPL_PENDING; mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } /* * What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, n; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; if (inc->inc_flags & INC_ISIPV6) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } -extern int always_keepalive; - /* * socket so could be a listening socket too. */ uint64_t calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e, int mtu_idx, int rscale, int rx_credits, int ulp_mode) { uint64_t opt0; KASSERT(rx_credits <= M_RCV_BUFSIZ, ("%s: rcv_bufsiz too high", __func__)); opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) | V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - int keepalive = always_keepalive || + int keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0 |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0 |= V_L2T_IDX(e->idx); if (vi != NULL) { opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); } return htobe64(opt0); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->pi->adapter; struct tp_params *tp = &sc->params.tp; uint16_t viid = vi->viid; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0) { uint32_t vf = G_FW_VIID_VIN(viid); uint32_t pf = G_FW_VIID_PFN(viid); uint32_t vld = G_FW_VIID_VIVLD(viid); ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) | V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } void set_tcpddp_ulp_mode(struct toepcb *toep) { toep->ulp_mode = ULP_MODE_TCPDDP; toep->ddp_flags = DDP_OK; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tabs(struct tid_info *t) { size_t size; unsigned int i; size = t->ntids * sizeof(*t->tid_tab) + t->natids * sizeof(*t->atid_tab) + t->nstids * sizeof(*t->stid_tab); t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids]; t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tabs(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); } static int add_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static int delete_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static struct clip_entry * search_lip(struct tom_data *td, struct in6_addr *lip) { struct clip_entry *ce; mtx_assert(&td->clip_table_lock, MA_OWNED); TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) return (ce); } return (NULL); } struct clip_entry * hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); if (ce == NULL) ce = search_lip(td, lip); if (ce != NULL) ce->refcount++; mtx_unlock(&td->clip_table_lock); return (ce); } void release_lip(struct tom_data *td, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); KASSERT(search_lip(td, &ce->lip) == ce, ("%s: CLIP entry %p p not in CLIP table.", __func__, ce)); KASSERT(ce->refcount > 0, ("%s: CLIP entry %p has refcount 0", __func__, ce)); --ce->refcount; mtx_unlock(&td->clip_table_lock); } static void init_clip_table(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF); TAILQ_INIT(&td->clip_table); td->clip_gen = -1; update_clip_table(sc, td); } static void update_clip(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc")) return; if (uld_active(sc, ULD_TOM)) update_clip_table(sc, sc->tom_softc); end_synchronized_op(sc, LOCK_HELD); } static void t4_clip_task(void *arg, int count) { t4_iterate(update_clip, NULL); } static void update_clip_table(struct adapter *sc, struct tom_data *td) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; struct in6_addr *lip, tlip; struct clip_head stale; struct clip_entry *ce, *ce_temp; struct vi_info *vi; int rc, gen, i, j; uintptr_t last_vnet; ASSERT_SYNCHRONIZED_OP(sc); IN6_IFADDR_RLOCK(&in6_ifa_tracker); mtx_lock(&td->clip_table_lock); gen = atomic_load_acq_int(&in6_ifaddr_gen); if (gen == td->clip_gen) goto done; TAILQ_INIT(&stale); TAILQ_CONCAT(&stale, &td->clip_table, link); /* * last_vnet optimizes the common cases where all if_vnet = NULL (no * VIMAGE) or all if_vnet = vnet0. */ last_vnet = (uintptr_t)(-1); for_each_port(sc, i) for_each_vi(sc->port[i], j, vi) { if (last_vnet == (uintptr_t)vi->ifp->if_vnet) continue; /* XXX: races with if_vmove */ CURVNET_SET(vi->ifp->if_vnet); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { lip = &ia->ia_addr.sin6_addr; KASSERT(!IN6_IS_ADDR_MULTICAST(lip), ("%s: mcast address in in6_ifaddr list", __func__)); if (IN6_IS_ADDR_LOOPBACK(lip)) continue; if (IN6_IS_SCOPE_EMBED(lip)) { /* Remove the embedded scope */ tlip = *lip; lip = &tlip; in6_clearscope(lip); } /* * XXX: how to weed out the link local address for the * loopback interface? It's fe80::1 usually (always?). */ /* * If it's in the main list then we already know it's * not stale. */ TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) goto next; } /* * If it's in the stale list we should move it to the * main list. */ TAILQ_FOREACH(ce, &stale, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) { TAILQ_REMOVE(&stale, ce, link); TAILQ_INSERT_TAIL(&td->clip_table, ce, link); goto next; } } /* A new IP6 address; add it to the CLIP table */ ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); memcpy(&ce->lip, lip, sizeof(ce->lip)); ce->refcount = 0; rc = add_lip(sc, lip); if (rc == 0) TAILQ_INSERT_TAIL(&td->clip_table, ce, link); else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not add %s (%d)\n", __func__, ip, rc); free(ce, M_CXGBE); } next: continue; } CURVNET_RESTORE(); last_vnet = (uintptr_t)vi->ifp->if_vnet; } /* * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are * no longer referenced by the driver. */ TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) { if (ce->refcount == 0) { rc = delete_lip(sc, &ce->lip); if (rc == 0) { TAILQ_REMOVE(&stale, ce, link); free(ce, M_CXGBE); } else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not delete %s (%d)\n", __func__, ip, rc); } } } /* The ones that are still referenced need to stay in the CLIP table */ TAILQ_CONCAT(&td->clip_table, &stale, link); td->clip_gen = gen; done: mtx_unlock(&td->clip_table_lock); IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); } static void destroy_clip_table(struct adapter *sc, struct tom_data *td) { struct clip_entry *ce, *ce_temp; if (mtx_initialized(&td->clip_table_lock)) { mtx_lock(&td->clip_table_lock); TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) { KASSERT(ce->refcount == 0, ("%s: CLIP entry %p still in use (%d)", __func__, ce, ce->refcount)); TAILQ_REMOVE(&td->clip_table, ce, link); delete_lip(sc, &ce->lip); free(ce, M_CXGBE); } mtx_unlock(&td->clip_table_lock); mtx_destroy(&td->clip_table_lock); } } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); destroy_clip_table(sc, td); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid; struct wrqe *wr; struct adapter *sc; mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); sc = td_adapter(td); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; struct sge_ofld_rxq *ofld_rxq; int i, j, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); /* CLIP table for IPv6 offload */ init_clip_table(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { TOEDEV(vi->ifp) = &td->tod; for_each_ofld_rxq(vi, j, ofld_rxq) { ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl; ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2; } } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static void t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp) { atomic_add_rel_int(&in6_ifaddr_gen, 1); taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; if (toep->ulp_mode == ULP_MODE_TCPDDP) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_tom_mod_load(void) { int rc; struct protosw *tcp_protosw, *tcp6_protosw; /* CPL handlers */ t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); rc = t4_ddp_mod_load(); if (rc != 0) return (rc); tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw)); bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs)); toe_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe_protosw.pr_usrreqs = &toe_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs)); toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe6_protosw.pr_usrreqs = &toe6_usrreqs; TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL); ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event, t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); rc = t4_register_uld(&tom_uld_info); if (rc != 0) t4_tom_mod_unload(); return (rc); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); if (ifaddr_evhandler) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler); taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL); } t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); Index: stable/11/sys/netinet/tcp_timer.c =================================================================== --- stable/11/sys/netinet/tcp_timer.c (revision 330302) +++ stable/11/sys/netinet/tcp_timer.c (revision 330303) @@ -1,1006 +1,1008 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_persmin; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); -static int always_keepalive = 1; +int tcp_always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, - &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); +__strong_reference(tcp_always_keepalive, always_keepalive); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); #define V_tcp_pmtud_blackhole_activated \ VNET(tcp_pmtud_blackhole_activated) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_activated), 0, "Path MTU Discovery Black Hole Detection, Activation Count"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); #define V_tcp_pmtud_blackhole_activated_min_mss \ VNET(tcp_pmtud_blackhole_activated_min_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_failed), 0, "Path MTU Discovery Black Hole Detection, Failure Count"); #ifdef INET static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif #ifdef RSS static int per_cpu_timers = 1; #else static int per_cpu_timers = 0; #endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); #if 0 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) #endif /* * Map the given inp to a CPU id. * * This queries RSS if it's compiled in, else it defaults to the current * CPU ID. */ static inline int inp_to_cpuid(struct inpcb *inp) { u_int cpuid; #ifdef RSS if (per_cpu_timers) { cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (curcpu); /* XXX */ else return (cpuid); } #else /* Legacy, pre-RSS behaviour */ if (per_cpu_timers) { /* * We don't have a flowid -> cpuid mapping, so cheat and * just map unknown cpuids to curcpu. Not the best, but * apparently better than defaulting to swi 0. */ cpuid = inp->inp_flowid % (mp_maxid + 1); if (! CPU_ABSENT(cpuid)) return (cpuid); return (curcpu); } #endif /* Default for RSS and non-RSS - cpuid 0 */ else { return (0); } } /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. */ void tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, ("%s: tp %p delack callout should be running", __func__, tp)); tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, ("%s: tp %p 2msl callout should be running", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. * * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); tp = tcp_close(tp); } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { if (!callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { tp->t_timers->tt_flags &= ~TT_2MSL_RST; } } else tp = tcp_close(tp); } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, ("%s: tp %p keep callout should be running", __func__, tp)); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; - if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + if ((tcp_always_keepalive || + inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. */ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, ("%s: tp %p persist callout should be running", __func__, tp)); /* * Persistence timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, ("%s: tp %p rexmt callout should be running", __func__, tp)); tcp_free_sackholes(tp); if (tp->t_fb->tfb_tcp_rexmit_tmr) { /* The stack has a timer action too. */ (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); } /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); headlocked = 1; goto out; } INP_INFO_RUNLOCK(&V_tcbinfo); headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. */ /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. * The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift > 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; V_tcp_pmtud_blackhole_failed++; /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tp->t_fb->tfb_tcp_output(tp); out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); if (headlocked) INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); uint32_t f_reset; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; f_reset = TT_2MSL_RST; break; default: if (tp->t_fb->tfb_tcp_timer_activate) { tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { if ((tp->t_timers->tt_flags & timer_type) && (callout_stop(t_callout) > 0) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } } else { if ((tp->t_timers->tt_flags & timer_type) == 0) { tp->t_timers->tt_flags |= (timer_type | f_reset); callout_reset_on(t_callout, delta, f_callout, tp, cpu); } else { /* Reset already running callout on the same CPU. */ if (!callout_reset(t_callout, delta, f_callout, tp)) { /* * Callout not cancelled, consider it as not * properly restarted. */ tp->t_timers->tt_flags &= ~f_reset; } } } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_active) { return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); } panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; uint32_t f_reset; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_reset = TT_2MSL_RST; break; default: if (tp->t_fb->tfb_tcp_timer_stop) { /* * XXXrrs we need to look at this with the * stop case below (flags). */ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (tp->t_timers->tt_flags & timer_type) { if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { /* * Can't stop the callout, defer tcpcb actual deletion * to the last one. We do this using the async drain * function and incrementing the count in */ tp->t_timers->tt_draincnt++; } } } #define ticks_to_msecs(t) (1000*(t) / hz) void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) { sbintime_t now; bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; now = getsbinuptime(); if (callout_active(&timer->tt_delack)) xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } Index: stable/11/sys/netinet/tcp_timer.h =================================================================== --- stable/11/sys/netinet/tcp_timer.h (revision 330302) +++ stable/11/sys/netinet/tcp_timer.h (revision 330303) @@ -1,208 +1,209 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments * have been sent for which ACKs are expected but not yet * received. If an ACK is received which advances tp->snd_una, * then the retransmit timer is cleared (if there are no more * outstanding segments) or reset to the base value (if there * are more ACKs expected). Whenever the retransmit timer goes off, * we retransmit one unacknowledged segment, and do a backoff * on the retransmit timer. * * The TCPT_PERSIST timer is used to keep window size information * flowing even if the window goes shut. If all previous transmissions * have been acknowledged (so that there are no retransmissions in progress), * and the window is too small to bother sending anything, then we start * the TCPT_PERSIST timer. When it expires, if the window is nonzero, * we go to transmit state. Otherwise, at intervals send a single byte * into the peer's window to force him to update our window information. * We do this at most as often as TCPT_PERSMIN time intervals, * but no more frequently than the current estimate of round-trip * packet time. The TCPT_PERSIST timer is cleared whenever we receive * a window update from the peer. * * The TCPT_KEEP timer is used to keep connections alive. If an * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, * but not yet established, then we drop the connection. Once the connection * is established, if the connection is idle for TCPTV_KEEP_IDLE time * (and keepalives have been enabled on the socket), we begin to probe * the connection. We force the peer to send us a segment by sending: * * This segment is (deliberately) outside the window, and should elicit * an ack segment in response from the peer. If, despite the TCPT_KEEP * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE * amount of time probing, then we drop the connection. */ /* * Time constants. */ #define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ #define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ #define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ #define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ #define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ #define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with * the expected worst-case processing variances by the kernels * representing the end points. Such variances do not always show * up in the srtt because the timestamp is often calculated at * the interface rather then at the TCP layer. This value is * typically 50ms. However, it is also possible that delayed * acks (typically 100ms) could create issues so we set the slop * to 200ms to try to cover it. Note that, properly speaking, * delayed-acks should not create a major issue for interactive * environments which 'P'ush the last segment, at least as * long as implementations do the required 'at least one ack * for every two packets' for the non-interactive streaming case. * (maybe the RTO calculation should use 2*RTT instead of RTT * to handle the ack-every-other-packet case). * * The prior minimum of 1*hz (1 second) badly breaks throughput on any * networks faster then a modem that has minor (e.g. 1%) packet loss. */ #define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ #define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ #define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ #define TCP_LINGERTIME 120 /* linger at most 2 minutes */ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ #define TCPTV_DELACK ( hz/10 ) /* 100ms timeout */ #ifdef TCPTIMERS static const char *tcptimers[] = { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" }; #endif /* * Force a time value to be in a certain range. */ #define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ (tv) = (value) + tcp_rexmit_slop; \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } while(0) #ifdef _KERNEL struct xtcp_timer; struct tcp_timer { struct callout tt_rexmt; /* retransmit timer */ struct callout tt_persist; /* retransmit persistence */ struct callout tt_keep; /* keepalive */ struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ struct callout tt_delack; /* delayed ACK timer */ uint32_t tt_flags; /* Timers flags */ uint32_t tt_draincnt; /* Count being drained */ }; /* * Flags for the tt_flags field. */ #define TT_DELACK 0x0001 #define TT_REXMT 0x0002 #define TT_PERSIST 0x0004 #define TT_KEEP 0x0008 #define TT_2MSL 0x0010 #define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) #define TT_DELACK_RST 0x0100 #define TT_REXMT_RST 0x0200 #define TT_PERSIST_RST 0x0400 #define TT_KEEP_RST 0x0800 #define TT_2MSL_RST 0x1000 #define TT_STOPPED 0x00010000 #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) extern int tcp_persmin; /* minimum persist interval */ extern int tcp_persmax; /* maximum persist interval */ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ extern int tcp_maxpersistidle; extern int tcp_rexmit_min; extern int tcp_rexmit_slop; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; extern int tcp_syn_backoff[]; +extern int tcp_always_keepalive; extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); void tcp_timer_discard(void *); struct tcptw * tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); void tcp_timer_delack(void *xtp); void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer); #endif /* _KERNEL */ #endif /* !_NETINET_TCP_TIMER_H_ */ Index: stable/11 =================================================================== --- stable/11 (revision 330302) +++ stable/11 (revision 330303) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r328608