D4055.id9858.diff

Index: sys/dev/cxgb/ulp/tom/cxgb_listen.c
===================================================================
--- sys/dev/cxgb/ulp/tom/cxgb_listen.c
+++ sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -753,7 +753,9 @@
INP_WLOCK_ASSERT(new_inp);
if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
- tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
+ struct tcpcb *tp;
+ tp = intotcpcb(new_inp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, 0);
t3_offload_socket(tod, synqe, so);
}
Index: sys/dev/cxgbe/tom/t4_listen.c
===================================================================
--- sys/dev/cxgbe/tom/t4_listen.c
+++ sys/dev/cxgbe/tom/t4_listen.c
@@ -1548,7 +1548,9 @@
* this somewhat defeats the purpose of having a tod_offload_socket :-(
*/
if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
- tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
+ struct tcpcb *tp;
+ tp = intotcpcb(new_inp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, 0);
t4_offload_socket(TOEDEV(ifp), synqe, so);
}
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -343,6 +343,7 @@
${_syscons} \
sysvipc \
${_ti} \
+ tcp/fastpath \
tl \
tmpfs \
${_toecore} \
Index: sys/modules/tcp/fastpath/Makefile
===================================================================
--- sys/modules/tcp/fastpath/Makefile
+++ sys/modules/tcp/fastpath/Makefile
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}
+
+KMOD= fastpath
+SRCS= fastpath.c
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+.include <bsd.kmod.mk>
Index: sys/modules/tcp/fastpath/fastpath.c
===================================================================
--- sys/modules/tcp/fastpath/fastpath.c
+++ sys/modules/tcp/fastpath/fastpath.c
@@ -0,0 +1,2411 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2007-2008,2010
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * Copyright (c) 2015 Netflix Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart,
+ * James Healy and David Hayes, made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
+ * Portions of this software were developed by Randall R. Stewart while
+ * working for Netflix Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h" /* for ipfw_fwd */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_kdtrace.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_syncache.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#include <security/mac/mac_framework.h>
+
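+/* Fast retransmit threshold; mirrors tcprexmtthresh (3) in tcp_input.c. */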
+static const int tcprexmtthresh = 3;
+
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
+VNET_DECLARE(int, tcp_do_rfc3042);
+#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_insecure_rst);
+#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
+VNET_DECLARE(int, tcp_insecure_syn);
+#define V_tcp_insecure_syn VNET(tcp_insecure_syn)
+
+extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+extern void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+ struct tcpcb *, int, int);
+extern void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+extern void tcp_xmit_timer(struct tcpcb *, int);
+extern void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+extern void tcp_mss(struct tcpcb *tp, int offer);
+extern void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+ uint16_t type);
+extern void cc_conn_init(struct tcpcb *tp);
+extern void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+extern void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+extern void hhook_run_tcp_est_in(struct tcpcb *tp,
+ struct tcphdr *th, struct tcpopt *to);
+
+extern void kmod_tcpstat_inc(int statnum);
+#ifdef TCP_SIGNATURE
+extern int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
+ struct tcpopt *to, struct tcphdr *th, u_int tcpbflag);
+#endif
+
+static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * - there is no delayed ack timer in progress and
+ * - our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window and
+ * - delayed acks are enabled or
+ * - this is a half-synchronized T/TCP connection.
+ * - the segment size is not larger than the MSS and LRO wasn't used
+ * for this segment.
+ */
+#define DELAY_ACK(tp, tlen) \
+ ((!tcp_timer_active(tp, TT_DELACK) && \
+ (tp->t_flags & TF_RXWIN0SENT) == 0) && \
+ (tlen <= tp->t_maxopd) && \
+ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+
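+/*
+ * Fast path for a pure ACK in the ESTABLISHED state: track window
+ * updates, record the timestamp, update the RTT estimate, drop the
+ * newly acked data from the send buffer and restart (or stop) the
+ * retransmit timer. Reached only after the header-prediction checks
+ * in tcp_do_segment_fastslow() have passed.
+ */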
+static void
+tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin)
+{
+ int acked;
+ int winup_only = 0;
+ /*
+ * The following if statement is true when we are doing the
+ * window update in the fast path <and>
+ * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
+ * - No more new data, but we have an ack for new data
+ * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) <or>
+ * - No more new data, the same ack point but the window grew
+ * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ */
+ if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+ winup_only = 1;
+ TCPSTAT_INC(tcps_rcvwinupd);
+ }
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_rxtshift == 1 &&
+ tp->t_flags & TF_PREVVALID &&
+ (int)(ticks - tp->t_badrxtwin) < 0) {
+ cc_cong_signal(tp, th, CC_RTO_ERR);
+ }
+
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ to->to_tsecr) {
+ u_int t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_xmit_timer(tp,
+ TCP_TS_TO_TICKS(t) + 1);
+ } else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow ||
+ tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp,
+ ticks - tp->t_rtttime);
+ }
+ if (winup_only == 0) {
+ acked = BYTES_THIS_ACK(tp, th);
+
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+ if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+
+ /*
+ * Let the congestion control algorithm update
+ * congestion control related information. This
+ * typically means increasing the congestion
+ * window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
+ tp->snd_una = th->th_ack;
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative
+ * to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ tp->t_dupacks = 0;
+ m_freem(m);
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp->snd_una == tp->snd_max)
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
+ } else {
+ /*
+ * Window update only, just free the mbufs and
+ * send out whatever we can.
+ */
+ m_freem(m);
+ }
+ sowwakeup(so);
+ if (sbavail(&so->so_snd))
+ (void) tcp_output(tp);
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
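+/*
+ * Fast path for pure, in-sequence data in the ESTABLISHED state with an
+ * empty reassembly queue: grow the receive buffer if auto-sizing allows,
+ * append the segment to the receive socket buffer and schedule either a
+ * delayed or an immediate ACK.
+ */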
+static void
+tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin)
+{
+ int newsize = 0; /* automatic sockbuf scaling */
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+
+ /*
+ * This is a pure, in-sequence data packet with
+ * nothing on the reassembly queue and we have enough
+ * buffer space to take it.
+ */
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ /* Clean receiver SACK report if present */
+ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+ TCPSTAT_INC(tcps_preddat);
+ tp->rcv_nxt += tlen;
+ /*
+ * Pull snd_wl1 up to prevent seq wrap relative to
+ * th_seq.
+ */
+ tp->snd_wl1 = th->th_seq;
+ /*
+ * Pull rcv_up up to prevent seq wrap relative to
+ * rcv_nxt.
+ */
+ tp->rcv_up = tp->rcv_nxt;
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+ /*
+ * Automatic sizing of receive socket buffer. Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product). Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer. For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+ * the new segments; TCP on the sender will just retransmit it
+ * later. Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. the number of bytes received during the time it takes
+ * one timestamp to be reflected back to us (the RTT);
+ * 2. received bytes per RTT is within seven eighth of the
+ * current socket buffer size;
+ * 3. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+ if (V_tcp_do_autorcvbuf &&
+ to->to_tsecr &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
+ to->to_tsecr - tp->rfbuf_ts < hz) {
+ if (tp->rfbuf_cnt >
+ (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat <
+ V_tcp_autorcvbuf_max) {
+ newsize =
+ min(so->so_rcv.sb_hiwat +
+ V_tcp_autorcvbuf_inc,
+ V_tcp_autorcvbuf_max);
+ }
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ /* Add data to socket buffer. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ m_freem(m);
+ } else {
+ /*
+ * Set new socket buffer size.
+ * Give up when limit is reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ }
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ if (DELAY_ACK(tp, tlen)) {
+ tp->t_flags |= TF_DELACK;
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
+ }
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
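+/*
+ * Slow path: full segment processing for anything that fails the
+ * header-prediction checks, following the stock tcp_do_segment()
+ * state machine (RFC 793 plus the RFC 1323/5961 amendments).
+ */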
+static void
+tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin, int thflags)
+{
+ int acked, ourfinisacked, needoutput = 0;
+ int rstreason, todrop, win;
+ char *s;
+ struct in_conninfo *inc;
+ struct mbuf *mfree = NULL;
+
+ /*
+ * Calculate amount of space in receive window,
+ * and then do TCP input processing.
+ * Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ inc = &tp->t_inpcb->inp_inc;
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+
+ switch (tp->t_state) {
+
+ /*
+ * If the state is SYN_RECEIVED:
+ * if seg contains an ACK, but not for our SYN/ACK, send a RST.
+ */
+ case TCPS_SYN_RECEIVED:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ break;
+
+ /*
+ * If the state is SYN_SENT:
+ * if seg contains an ACK, but not for our SYN, drop the input.
+ * if seg contains a RST, then drop the connection.
+ * if seg does not contain SYN, then drop it.
+ * Otherwise this is an acceptable SYN segment
+ * initialize tp->rcv_nxt and tp->irs
+ * if seg contains ack then advance tp->snd_una
+ * if seg contains an ECE and ECN support is enabled, the stream
+ * is ECN capable.
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+ * arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ case TCPS_SYN_SENT:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
+ TCP_PROBE5(connect__refused, NULL, tp,
+ mtod(m, const char *), tp, th);
+ tp = tcp_drop(tp, ECONNREFUSED);
+ }
+ if (thflags & TH_RST)
+ goto drop;
+ if (!(thflags & TH_SYN))
+ goto drop;
+
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+#ifdef MAC
+ mac_socketpeer_set_from_mbuf(m, so);
+#endif
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ tp->rcv_adv += imin(tp->rcv_wnd,
+ TCP_MAXWIN << tp->rcv_scale);
+ tp->snd_una++; /* SYN is acked */
+ /*
+ * If there's data, delay ACK; if there's also a FIN
+ * ACKNOW will be turned on later.
+ */
+ if (DELAY_ACK(tp, tlen) && tlen != 0)
+ tp->t_fb->tcp_timer_activate(tp, TT_DELACK,
+ tcp_delacktime);
+ else
+ tp->t_flags |= TF_ACKNOW;
+
+ if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ TCPSTAT_INC(tcps_ecn_shs);
+ }
+
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state.
+ * Transitions:
+ * SYN_SENT --> ESTABLISHED
+ * SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(connect__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ cc_conn_init(tp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP,
+ TP_KEEPIDLE(tp));
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state =>
+ * simultaneous open.
+ * If it succeeds, connection is half-synchronized.
+ * Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED
+ * SYN-SENT* -> SYN-RECEIVED*
+ */
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
+ }
+
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
+ "ti_locked %d", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Advance th->th_seq to correspond to first data byte.
+ * If data, trim to stay within window,
+ * dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data.
+ * If the remote host used T/TCP to validate the SYN,
+ * our data will be ACK'd; if so, enter normal data segment
+ * processing in the middle of step 5, ack processing.
+ * Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK)
+ goto process_ACK;
+
+ goto step6;
+
+ /*
+ * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+ * do normal processing.
+ *
+ * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
+ */
+ case TCPS_LAST_ACK:
+ case TCPS_CLOSING:
+ break; /* continue normal processing */
+ }
+
+ /*
+ * States other than LISTEN or SYN_SENT.
+ * First check the RST flag and sequence number since reset segments
+ * are exempt from the timestamp and connection count tests. This
+ * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
+ * below which allowed reset segments in half the sequence space
+ * to fall though and be processed (which gives forged reset
+ * segments with a random sequence number a 50 percent chance of
+ * killing a connection).
+ * Then check timestamp, if present.
+ * Then check the connection count, if present.
+ * Then check that at least some bytes of segment are within
+ * receive window. If segment begins before rcv_nxt,
+ * drop leading data (and SYN); if nothing left, just ack.
+ */
+ if (thflags & TH_RST) {
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT.
+ * - If RST is in window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should
+ * test against last_ack_sent instead of rcv_nxt.
+ * Note 2: we handle special case of closed window, not
+ * covered by the RFC.
+ */
+ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("%s: TH_RST ti_locked %d, th %p tp %p",
+ __func__, ti_locked, th, tp));
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ tp->last_ack_sent == th->th_seq) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ }
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC5961 Section 4.2
+ * Send challenge ACK for any SYN in synchronized state.
+ */
+ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment
+ * and it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+
+ /* Check to see if ts_recent is over 24 days old. */
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates
+ * ts_recent, the age will be reset later and ts_recent
+ * will get a valid value. If it does not, setting
+ * ts_recent to zero will at least satisfy the
+ * requirement that zero be placed in the timestamp
+ * echo reply when ts_recent isn't valid. The
+ * age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be
+ * dropped when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ if (tlen)
+ goto dropafterack;
+ goto drop;
+ }
+ }
+
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know
+ * the sequence numbers haven't wrapped. This is a partial fix
+ * for the "LAND" DoS attack.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+
+ /*
+ * If new data are received on a connection after the
+ * user processes are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tp->t_state > TCPS_CLOSE_WAIT && tlen) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDREF && "
+ "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
+ "after socket was closed, "
+ "sending RST and removing tcpcb\n",
+ s, __func__, tcpstates[tp->t_state], tlen);
+ free(s, M_TCPLOG);
+ }
+ tp = tcp_close(tp);
+ TCPSTAT_INC(tcps_rcvafterclose);
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If segment ends after window, drop trailing data
+ * (and PUSH and FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment
+ * and ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else
+ goto dropafterack;
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH|TH_FIN);
+ }
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record its timestamp.
+ * NOTE:
+ * 1) That the test incorporates suggestions from the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ * 2) That updating only on newer timestamps interferes with
+ * our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment.
+ * 3) That we modify the segment boundary check to be
+ * Last.ACK.Sent <= SEG.SEQ + SEG.Len
+ * instead of RFC1323's
+ * Last.ACK.Sent < SEG.SEQ + SEG.Len,
+ * This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated
+ * Vol. 2 p.869. In such cases, we can still calculate the
+ * RTT correctly when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN|TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
+ * flag is on (half-synchronized state), then queue data for
+ * later processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_state == TCPS_SYN_RECEIVED ||
+ (tp->t_flags & TF_NEEDSYN))
+ goto step6;
+ else if (tp->t_flags & TF_ACKNOW)
+ goto dropafterack;
+ else
+ goto drop;
+ }
+
+ /*
+ * Ack processing.
+ */
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
+ * ESTABLISHED state and continue processing.
+ * The ACK was checked above.
+ */
+ case TCPS_SYN_RECEIVED:
+
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ tp->snd_wnd = tiwin;
+ }
+ /*
+ * Make transitions:
+ * SYN-RECEIVED -> ESTABLISHED
+ * SYN-RECEIVED* -> FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(accept__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ cc_conn_init(tp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass()
+ * later; if not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void) tcp_reass(tp, (struct tcphdr *)0, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ /* FALLTHROUGH */
+
+ /*
+ * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
+ * ACKs. If the ack is in the range
+ * tp->snd_una < th->th_ack <= tp->snd_max
+ * then advance tp->snd_una to th->th_ack and drop
+ * data from the retransmission queue. If this ACK reflects
+ * more up to date window information we update our window information.
+ */
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ TCPSTAT_INC(tcps_rcvacktoomuch);
+ goto dropafterack;
+ }
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ ((to->to_flags & TOF_SACK) ||
+ !TAILQ_EMPTY(&tp->snd_holes)))
+ tcp_sack_doack(tp, to, th->th_ack);
+ else
+ /*
+ * Reset the value so that previous (valid) value
+ * from the last ack with SACK doesn't get used.
+ */
+ tp->sackhint.sacked_bytes = 0;
+
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+
+ if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
+ if (tlen == 0 && tiwin == tp->snd_wnd) {
+ /*
+ * If this is the first time we've seen a
+ * FIN from the remote, this is not a
+ * duplicate and it needs to be processed
+ * normally. This happens during a
+ * simultaneous close.
+ */
+ if ((thflags & TH_FIN) &&
+ (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ TCPSTAT_INC(tcps_rcvdupack);
+ /*
+ * If we have outstanding data (other than
+ * a window probe), this is a completely
+ * duplicate ack (ie, window info didn't
+ * change and FIN isn't set),
+ * the ack is the biggest we've
+ * seen and we've seen exactly our rexmt
+ * threshold of them, assume a packet
+ * has been dropped and retransmit it.
+ * Kludge snd_nxt & the congestion
+ * window so we send only this one
+ * packet.
+ *
+ * We know we're losing at the current
+ * window size so do congestion avoidance
+ * (set ssthresh to half the current window
+ * and pull our congestion window back to
+ * the new ssthresh).
+ *
+ * Dup acks mean that packets have left the
+ * network (they're now cached at the receiver)
+ * so bump cwnd by the amount in the receiver
+ * to keep a constant cwnd packets in the
+ * network.
+ *
+ * When using TCP ECN, notify the peer that
+ * we reduced the cwnd.
+ */
+ if (!tp->t_fb->tcp_timer_active(tp, TT_REXMT) ||
+ th->th_ack != tp->snd_una)
+ tp->t_dupacks = 0;
+ else if (++tp->t_dupacks > tcprexmtthresh ||
+ IN_FASTRECOVERY(tp->t_flags)) {
+ cc_ack_received(tp, th, CC_DUPACK);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ IN_FASTRECOVERY(tp->t_flags)) {
+ int awnd;
+
+ /*
+ * Compute the amount of data in flight first.
+ * We can inject new data into the pipe iff
+ * we have less than 1/2 the original window's
+ * worth of data in flight.
+ */
+ if (V_tcp_do_rfc6675_pipe)
+ awnd = tcp_compute_pipe(tp);
+ else
+ awnd = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+
+ if (awnd < tp->snd_ssthresh) {
+ tp->snd_cwnd += tp->t_maxseg;
+ if (tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ }
+ } else
+ tp->snd_cwnd += tp->t_maxseg;
+ (void) tp->t_fb->tcp_output(tp);
+ goto drop;
+ } else if (tp->t_dupacks == tcprexmtthresh) {
+ tcp_seq onxt = tp->snd_nxt;
+
+ /*
+ * If we're doing sack, check to
+ * see if we're already in sack
+ * recovery. If we're not doing sack,
+ * check to see if we're in newreno
+ * recovery.
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ } else {
+ if (SEQ_LEQ(th->th_ack,
+ tp->snd_recover)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ }
+ /* Congestion signal before ack. */
+ cc_cong_signal(tp, th, CC_NDUPACK);
+ cc_ack_received(tp, th, CC_DUPACK);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rtttime = 0;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ TCPSTAT_INC(
+ tcps_sack_recovery_episode);
+ tp->sack_newdata = tp->snd_nxt;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tp->t_fb->tcp_output(tp);
+ goto drop;
+ }
+ tp->snd_nxt = th->th_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tp->t_fb->tcp_output(tp);
+ KASSERT(tp->snd_limited <= 2,
+ ("%s: tp->snd_limited too big",
+ __func__));
+ tp->snd_cwnd = tp->snd_ssthresh +
+ tp->t_maxseg *
+ (tp->t_dupacks - tp->snd_limited);
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ goto drop;
+ } else if (V_tcp_do_rfc3042) {
+ /*
+ * Process first and second duplicate
+ * ACKs. Each indicates a segment
+ * leaving the network, creating room
+ * for more. Make sure we can send a
+ * packet on reception of each duplicate
+ * ACK by increasing snd_cwnd by one
+ * segment. Restore the original
+ * snd_cwnd after packet transmission.
+ */
+ cc_ack_received(tp, th, CC_DUPACK);
+ u_long oldcwnd = tp->snd_cwnd;
+ tcp_seq oldsndmax = tp->snd_max;
+ u_int sent;
+ int avail;
+
+ KASSERT(tp->t_dupacks == 1 ||
+ tp->t_dupacks == 2,
+ ("%s: dupacks not 1 or 2",
+ __func__));
+ if (tp->t_dupacks == 1)
+ tp->snd_limited = 0;
+ tp->snd_cwnd =
+ (tp->snd_nxt - tp->snd_una) +
+ (tp->t_dupacks - tp->snd_limited) *
+ tp->t_maxseg;
+ /*
+ * Only call tcp_output when there
+ * is new data available to be sent.
+ * Otherwise we would send pure ACKs.
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ avail = sbavail(&so->so_snd) -
+ (tp->snd_nxt - tp->snd_una);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (avail > 0)
+ (void) tp->t_fb->tcp_output(tp);
+ sent = tp->snd_max - oldsndmax;
+ if (sent > tp->t_maxseg) {
+ KASSERT((tp->t_dupacks == 2 &&
+ tp->snd_limited == 0) ||
+ (sent == tp->t_maxseg + 1 &&
+ tp->t_flags & TF_SENTFIN),
+ ("%s: sent too much",
+ __func__));
+ tp->snd_limited = 2;
+ } else if (sent > 0)
+ ++tp->snd_limited;
+ tp->snd_cwnd = oldcwnd;
+ goto drop;
+ }
+ } else
+ tp->t_dupacks = 0;
+ break;
+ }
+
+ KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
+ ("%s: th_ack <= snd_una", __func__));
+
+ /*
+ * If the congestion window was inflated to account
+ * for the other side's cached packets, retract it.
+ */
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (tp->t_flags & TF_SACK_PERMIT)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
+ } else
+ cc_post_recovery(tp, th);
+ }
+ tp->t_dupacks = 0;
+ /*
+ * If we reach this point, ACK is not a duplicate,
+ * i.e., it ACKs something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our
+ * SYN has been ACK'd (so connection is now fully
+ * synchronized). Go to non-starred state,
+ * increment snd_una for ACK of SYN, and check if
+ * we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ /* Send window already scaled. */
+ }
+ }
+
+process_ACK:
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ acked = BYTES_THIS_ACK(tp, th);
+ TCPSTAT_INC(tcps_rcvackpack);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+
+ /*
+ * If we just performed our first retransmit, and the ACK
+ * arrives within our recovery window, then it was a mistake
+ * to do the retransmit in the first place. Recover our
+ * original cwnd and ssthresh, and proceed to transmit where
+ * we left off.
+ */
+ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ cc_cong_signal(tp, th, CC_RTO_ERR);
+
+ /*
+ * If we have a timestamp reply, update smoothed
+ * round trip time. If no timestamp is present but
+ * transmit timer is running and timed sequence
+ * number was acked, update smoothed round trip time.
+ * Since we now have an rtt measurement, cancel the
+ * timer backoff (cf., Phil Karn's retransmit alg.).
+ * Recompute the initial retransmit timer.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
+ u_int t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
+ } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ }
+
+ /*
+ * If all outstanding data is acked, stop retransmit
+ * timer and remember to restart (more output or persist).
+ * If there is more data to be acked, restart retransmit
+ * timer, using current (possibly backed-off) value.
+ */
+ if (th->th_ack == tp->snd_max) {
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
+ needoutput = 1;
+ } else if (!tp->t_fb->tcp_timer_active(tp, TT_PERSIST))
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+
+ /*
+ * If no data (only SYN) was ACK'd,
+ * skip rest of ACK processing.
+ */
+ if (acked == 0)
+ goto step6;
+
+ /*
+ * Let the congestion control algorithm update congestion
+ * control related information. This typically means increasing
+ * the congestion window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if (acked > sbavail(&so->so_snd)) {
+ tp->snd_wnd -= sbavail(&so->so_snd);
+ mfree = sbcut_locked(&so->so_snd,
+ (int)sbavail(&so->so_snd));
+ ourfinisacked = 1;
+ } else {
+ mfree = sbcut_locked(&so->so_snd, acked);
+ tp->snd_wnd -= acked;
+ ourfinisacked = 0;
+ }
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
+ m_freem(mfree);
+ /* Detect una wraparound. */
+ if (!IN_RECOVERY(tp->t_flags) &&
+ SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+ /* XXXLAS: Can this be moved up into cc_post_recovery? */
+ if (IN_RECOVERY(tp->t_flags) &&
+ SEQ_GEQ(th->th_ack, tp->snd_recover)) {
+ EXIT_RECOVERY(tp->t_flags);
+ }
+ tp->snd_una = th->th_ack;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ }
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una))
+ tp->snd_nxt = tp->snd_una;
+
+ switch (tp->t_state) {
+
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now acknowledged
+ * then enter FIN_WAIT_2.
+ */
+ case TCPS_FIN_WAIT_1:
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more
+ * data, then closing user can proceed.
+ * Starting the timer is contrary to the
+ * specification, but if we don't get a FIN
+ * we'll hang forever.
+ *
+ * XXXjl:
+ * we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tp->t_fb->tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ break;
+
+ /*
+ * In CLOSING STATE in addition to the processing for
+ * the ESTABLISHED state if the ACK acknowledges our FIN
+ * then enter the TIME-WAIT state, otherwise ignore
+ * the segment.
+ */
+ case TCPS_CLOSING:
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tcp_twstart(tp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ m_freem(m);
+ return;
+ }
+ break;
+
+ /*
+ * In LAST_ACK, we may still be waiting for data to drain
+ * and/or to be acked, as well as for the ack of our FIN.
+ * If our FIN is now acknowledged, delete the TCB,
+ * enter the closed state and return.
+ */
+ case TCPS_LAST_ACK:
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tp = tcp_close(tp);
+ goto drop;
+ }
+ break;
+ }
+ }
+
+step6:
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Update window information.
+ * Don't look at window if no ACK: TAC's send garbage on first SYN.
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ TCPSTAT_INC(tcps_rcvwinupd);
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ needoutput = 1;
+ }
+
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept
+ * random urgent pointers, we'll crash in
+ * soreceive. It's hard to imagine someone
+ * actually wanting to send this much urgent data.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer,
+ * then mark the data stream. This should not happen
+ * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
+ * a FIN has been received from the remote side.
+ * In these states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = sbavail(&so->so_rcv) +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * Remove out of band data so doesn't get presented to user.
+ * This can happen independent of advancing the URG pointer,
+ * but if two URG's are pending at once, some out-of-band
+ * data may creep in... ick.
+ */
+ if (th->th_urp <= (u_long)tlen &&
+ !(so->so_options & SO_OOBINLINE)) {
+ /* hdr drop is delayed */
+ tcp_pulloutofband(so, th, m, drop_hdrlen);
+ }
+ } else {
+ /*
+ * If no out of band data is expected,
+ * pull receive urgent pointer along
+ * with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+ }
+dodata: /* XXX */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Process the segment text, merging it into the TCP sequencing queue,
+ * and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data
+ * is presented to the user (this happens in tcp_usrreq.c,
+ * case PRU_RCVD). If a FIN has already been received on this
+ * connection then we just ignore the text.
+ */
+ if ((tlen || (thflags & TH_FIN)) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ tcp_seq save_start = th->th_seq;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+ * Insert segment which includes th into TCP reassembly queue
+ * with control block tp. Set thflags to whether reassembly now
+ * includes a segment with FIN. This handles the common case
+ * inline (segment is the next to be received on an established
+ * connection, and the queue is empty), avoiding linkage into
+ * and removal from the queue and repetition of various
+ * conversions.
+ * Set DELACK for segments received in order, but ack
+ * immediately when segments are out of order (so
+ * fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (DELAY_ACK(tp, tlen))
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ TCPSTAT_INC(tcps_rcvpack);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ m_freem(m);
+ else
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ } else {
+ /*
+ * XXX: Due to the header drop above "th" is
+ * theoretically invalid by now. Fortunately
+ * m_adj() doesn't actually free any mbufs
+ * when trimming from the head.
+ */
+ thflags = tcp_reass(tp, th, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+ if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
+ tcp_update_sack_list(tp, save_start, save_start + tlen);
+#if 0
+ /*
+ * Note the amount of data that peer has sent into
+ * our window, in order to estimate the sender's
+ * buffer size.
+ * XXX: Unused.
+ */
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
+ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+ else
+ len = so->so_rcv.sb_hiwat;
+#endif
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know
+ * that the connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized
+ * (ie NEEDSYN flag on) then delay ACK,
+ * so it may be piggybacked when SYN is sent.
+ * Otherwise, since we received a FIN then no
+ * more input can be expected, send ACK now.
+ */
+ if (tp->t_flags & TF_NEEDSYN)
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES
+ * enter the CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ tcp_state_change(tp, TCPS_CLOSE_WAIT);
+ break;
+
+ /*
+ * If still in FIN_WAIT_1 STATE FIN has not been acked so
+ * enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ tcp_state_change(tp, TCPS_CLOSING);
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the other
+ * standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
+ "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
+ ti_locked));
+
+ tcp_twstart(tp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return;
+ }
+ }
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+
+ /*
+ * Return any desired output.
+ */
+ if (needoutput || (tp->t_flags & TF_ACKNOW))
+ (void) tp->t_fb->tcp_output(tp);
+
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tp->t_fb->tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+ return;
+
+dropafterack:
+ /*
+ * Generate an ACK dropping incoming segment if it occupies
+ * sequence space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all
+ * paths to this code happen after packets containing
+ * RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the
+ * segment we received passes the SYN-RECEIVED ACK test.
+ * If it fails send a RST. This breaks the loop in the
+ * "LAND" DoS attack, and also prevents an ACK storm
+ * between two listening ports that have been sent forged
+ * SYN segments, each with the source address of the other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max)) ) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ tp->t_flags |= TF_ACKNOW;
+ (void) tp->t_fb->tcp_output(tp);
+ INP_WUNLOCK(tp->t_inpcb);
+ m_freem(m);
+ return;
+
+dropwithreset:
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+ return;
+
+drop:
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
+#ifdef INVARIANTS
+ else
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
+
+ /*
+ * Drop space held by incoming segment and return.
+ */
+#ifdef TCPDEBUG
+ if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ m_freem(m);
+}
+
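+/*
+ * tcp_do_segment() handler for the fastslow stack: validate locking,
+ * process ECN and TCP options, then attempt the two header-prediction
+ * fast paths (pure ACK, in-sequence new data) before falling back to
+ * tcp_do_slowpath().
+ */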
+static void
+tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+ int ti_locked)
+{
+ int thflags;
+ u_long tiwin;
+ char *s;
+ int can_enter;
+ struct in_conninfo *inc;
+ struct tcpopt to;
+
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+ thflags = th->th_flags;
+ tp->sackhint.last_sack_ack = 0;
+ inc = &tp->t_inpcb->inp_inc;
+ /*
+ * If this is either a state-changing packet or current state isn't
+ * established, we require a write lock on tcbinfo. Otherwise, we
+ * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a write lock due to a race.
+ */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+ "SYN/FIN/RST/!EST", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ * XXX: This should be done after segment
+ * validation to ignore broken/spoofed segs.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+
+ /*
+ * Unscale the window into a 32-bit value.
+ * For the SYN_SENT state the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+
+ /*
+ * TCP ECN processing.
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ cc_cong_signal(tp, th, CC_ECN);
+ }
+ }
+
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ /*
+ * If echoed timestamp is later than the current time,
+ * fall back to non RFC1323 RTT calculation. Normalize
+ * timestamp if syncookies were used when this connection
+ * was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+ to.to_tsecr = 0;
+ }
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session and vice versa.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+
+ /*
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
+ * XXX this is traditional behavior, may need to be cleaned up.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with
+ * the next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = tcp_ts_getticks();
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ }
+ can_enter = 0;
+ if (__predict_true((tlen == 0))) {
+ /*
+ * The ack moved forward and we have a window (non-zero)
+ * <or>
+ * The ack did not move forward, but the window increased.
+ */
+ if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
+ ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
+ can_enter = 1;
+ }
+ } else {
+ /*
+ * Data incoming, use the old entry criteria
+ * for fast-path with data.
+ */
+ if ((tiwin && tiwin == tp->snd_wnd)) {
+ can_enter = 1;
+ }
+ }
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED first, it can only
+ * be TH_NEEDSYN.
+ */
+ if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
+ th->th_seq == tp->rcv_nxt &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+ tp->snd_nxt == tp->snd_max &&
+ can_enter &&
+ ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
+ LIST_EMPTY(&tp->t_segq) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
+ if (__predict_true((tlen == 0) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_max) &&
+ !IN_RECOVERY(tp->t_flags) &&
+ (to.to_flags & TOF_SACK) == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes)))) {
+ /* We are done */
+ tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin);
+ return;
+ } else if ((tlen) &&
+ (th->th_ack == tp->snd_una &&
+ tlen <= sbspace(&so->so_rcv))) {
+ tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin);
+ /* We are done */
+ return;
+ }
+ }
+ tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags);
+}
+
+static int
+tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin)
+{
+ int acked;
+ int winup_only = 0;
+#ifdef TCPDEBUG
+ /* Needed by the TCPDEBUG trace below; not filled in on this path. */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+
+ if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+ /* Old ack, behind (or duplicate to) the last one rcv'd */
+ return (0);
+ }
+ if (__predict_false(th->th_ack == tp->snd_una) &&
+ __predict_false(tiwin <= tp->snd_wnd)) {
+ /* Duplicate ack with an unchanged or shrinking window. */
+ return (0);
+ }
+ if (__predict_false(tiwin == 0)) {
+ /* zero window */
+ return (0);
+ }
+ if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
+ /* Above what we have sent? */
+ return (0);
+ }
+ if (__predict_false(tp->snd_nxt != tp->snd_max)) {
+ /* We are retransmitting */
+ return (0);
+ }
+ if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
+ /* We need a SYN or a FIN, unlikely.. */
+ return (0);
+ }
+ if ((to->to_flags & TOF_TS) &&
+ __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
+ /* Timestamp is behind... old ack with seq wrap? */
+ return (0);
+ }
+ if (__predict_false(IN_RECOVERY(tp->t_flags))) {
+ /* Still recovering */
+ return (0);
+ }
+ if (__predict_false(to->to_flags & TOF_SACK)) {
+ /* Sack included in the ack.. */
+ return (0);
+ }
+ if (!TAILQ_EMPTY(&tp->snd_holes)) {
+ /* We have sack holes on our scoreboard */
+ return (0);
+ }
+ /* Ok if we reach here, we can process a fast-ack */
+
+ /* Did the window get updated? */
+ if (tiwin != tp->snd_wnd) {
+ /* keep track of pure window updates */
+ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+ winup_only = 1;
+ TCPSTAT_INC(tcps_rcvwinupd);
+ }
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative
+ * to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_rxtshift == 1 &&
+ tp->t_flags & TF_PREVVALID &&
+ (int)(ticks - tp->t_badrxtwin) < 0) {
+ cc_cong_signal(tp, th, CC_RTO_ERR);
+ }
+
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ to->to_tsecr) {
+ u_int t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_xmit_timer(tp,
+ TCP_TS_TO_TICKS(t) + 1);
+ } else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow ||
+ tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp,
+ ticks - tp->t_rtttime);
+ }
+ if (winup_only == 0) {
+ acked = BYTES_THIS_ACK(tp, th);
+
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+ if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+
+ /*
+ * Let the congestion control algorithm update
+ * congestion control related information. This
+ * typically means increasing the congestion
+ * window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
+ tp->snd_una = th->th_ack;
+ tp->t_dupacks = 0;
+ m_freem(m);
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp->snd_una == tp->snd_max)
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
+ /* Wake up the socket if we have room to write more */
+ sowwakeup(so);
+ } else {
+ /*
+ * Window update only, just free the mbufs and
+ * send out whatever we can.
+ */
+ m_freem(m);
+ }
+ if (sbavail(&so->so_snd))
+ (void) tcp_output(tp);
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+ return (1);
+}
+
+void
+tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+ int ti_locked)
+{
+ int thflags;
+ u_long tiwin;
+ char *s;
+ struct in_conninfo *inc;
+ struct tcpopt to;
+
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+ thflags = th->th_flags;
+ tp->sackhint.last_sack_ack = 0;
+ inc = &tp->t_inpcb->inp_inc;
+ /*
+ * If this is either a state-changing packet or the current state isn't
+ * established, we require at least a read lock on tcbinfo. Otherwise,
+ * we allow the tcbinfo to be in either a locked or an unlocked state,
+ * as the caller may have unnecessarily acquired a write lock due to a
+ * race.
+ */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+ "SYN/FIN/RST/!EST", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ * XXX: This should be done after segment
+ * validation to ignore broken/spoofed segs.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+
+ /*
+ * Scale up the window into a 32-bit value.
+ * For the SYN_SENT state the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+
+ /*
+ * TCP ECN processing.
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ cc_cong_signal(tp, th, CC_ECN);
+ }
+ }
+
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ /*
+ * If echoed timestamp is later than the current time,
+ * fall back to non RFC1323 RTT calculation. Normalize
+ * timestamp if syncookies were used when this connection
+ * was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+ to.to_tsecr = 0;
+ }
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session and vice versa.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+
+ /*
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
+ * XXX this is traditional behavior, may need to be cleaned up.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with
+ * the next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = tcp_ts_getticks();
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ }
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED first, it can only
+ * be TH_NEEDSYN.
+ */
+ if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
+ __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
+ __predict_true(tlen == 0) &&
+ __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
+ __predict_true(LIST_EMPTY(&tp->t_segq)) &&
+ __predict_true(th->th_seq == tp->rcv_nxt)) {
+ if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin)) {
+ return;
+ }
+ }
+ tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags);
+}
+
+struct tcp_function_block __tcp_fastslow = {
+ "fastslow",
+ tcp_timer_activate,
+ tcp_timer_active,
+ tcp_timer_stop,
+ tcp_output,
+ tcp_do_segment_fastslow,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ 0
+};
+
+struct tcp_function_block __tcp_fastack = {
+ "fastack",
+ tcp_timer_activate,
+ tcp_timer_active,
+ tcp_timer_stop,
+ tcp_output,
+ tcp_do_segment_fastack,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ 0
+};
+
+static int
+tcp_addfastpaths(module_t mod, int type, void *data)
+{
+ int err = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
+ if (err) {
+ printf("Failed to register fastack module -- err:%d\n", err);
+ return(err);
+ }
+ err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
+ if (err) {
+ printf("Failed to register fastslow module -- err:%d\n", err);
+ deregister_tcp_functions(&__tcp_fastack);
+ return(err);
+ }
+ break;
+ case MOD_QUIESCE:
+ if (__tcp_fastslow.refcnt || __tcp_fastack.refcnt) {
+ return (EBUSY);
+ }
+ break;
+ case MOD_UNLOAD:
+ err = deregister_tcp_functions(&__tcp_fastack);
+ if (err == EBUSY)
+ break;
+ err = deregister_tcp_functions(&__tcp_fastslow);
+ if (err == EBUSY)
+ break;
+ err = 0;
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (err);
+}
+
+static moduledata_t new_tcp_fastpaths = {
+ .name = "tcp_fastpaths",
+ .evhand = tcp_addfastpaths,
+ .priv = 0
+};
+
+MODULE_VERSION(kern_tcpfastpaths, 1);
+DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -167,7 +167,7 @@
#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */
#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
-
+#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -246,4 +246,9 @@
};
#endif
+struct tcp_function_set {
+ char function_set_name[32];
+ uint32_t pcbcnt;
+};
+
#endif /* !_NETINET_TCP_H_ */
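
For illustration, here is a minimal userland sketch of opting a single socket into an alternate stack with the TCP_FUNCTION_BLK option and struct tcp_function_set defined above. The helper name set_tcp_stack is hypothetical and error handling is elided; per the kernel side of this change, the option must be set while the connection is still CLOSED, and the named set ("fastslow" below, from the fastpath module) must already be registered.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

/* Attach socket s to the named TCP function set. */
static int
set_tcp_stack(int s, const char *name)
{
	struct tcp_function_set fs;

	memset(&fs, 0, sizeof(fs));
	strlcpy(fs.function_set_name, name, sizeof(fs.function_set_name));
	/* Must happen before connect()/listen(), i.e. in the CLOSED state. */
	return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs)));
}

/* Usage: s = socket(AF_INET, SOCK_STREAM, 0); set_tcp_stack(s, "fastslow"); */
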
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -230,23 +230,22 @@
#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void tcp_do_segment(struct mbuf *, struct tcphdr *,
- struct socket *, struct tcpcb *, int, int, uint8_t,
- int);
-static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
struct tcpcb *, int, int);
-static void tcp_pulloutofband(struct socket *,
+void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
-static void tcp_xmit_timer(struct tcpcb *, int);
-static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+void tcp_xmit_timer(struct tcpcb *, int);
+void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
uint16_t type);
-static void inline cc_conn_init(struct tcpcb *tp);
-static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline hhook_run_tcp_est_in(struct tcpcb *tp,
+void cc_conn_init(struct tcpcb *tp);
+void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+void hhook_run_tcp_est_in(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to);
+void kmod_tcpstat_inc(int statnum);
/*
* TCP statistics are stored in an "array" of counter(9)s.
*/
@@ -272,7 +271,7 @@
/*
* Wrapper for the TCP established input helper hook.
*/
-static void inline
+void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
struct tcp_hhook_data hhook_data;
@@ -290,7 +289,7 @@
/*
* CC wrapper hook functions
*/
-static void inline
+void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -322,7 +321,7 @@
}
}
-static void inline
+void
cc_conn_init(struct tcpcb *tp)
{
struct hc_metrics_lite metrics;
@@ -446,7 +445,7 @@
}
}
-static void inline
+void
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -487,7 +486,7 @@
* connection.
*/
#define DELAY_ACK(tp, tlen) \
- ((!tcp_timer_active(tp, TT_DELACK) && \
+ ((!tp->t_fb->tcp_timer_active(tp, TT_DELACK) && \
(tp->t_flags & TF_RXWIN0SENT) == 0) && \
(tlen <= tp->t_maxopd) && \
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
@@ -523,7 +522,7 @@
CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
if (tp->ccv->flags & CCF_ACKNOW)
- tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ tp->t_fb->tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
}
@@ -1175,7 +1174,7 @@
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+ tp->t_fb->tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return (IPPROTO_DONE);
@@ -1421,7 +1420,8 @@
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+ printf("Do segment call ti_locked:%d\n", ti_locked);
+ tp->t_fb->tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return (IPPROTO_DONE);
@@ -1476,7 +1476,7 @@
return (IPPROTO_DONE);
}
-static void
+void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
int ti_locked)
@@ -1543,7 +1543,7 @@
*/
tp->t_rcvtime = ticks;
if (TCPS_HAVEESTABLISHED(tp->t_state))
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
/*
* Scale up the window into a 32-bit value.
@@ -1781,13 +1781,13 @@
TCP_PROBE3(debug__input, tp, th,
mtod(m, const char *));
if (tp->snd_una == tp->snd_max)
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
else if (!tcp_timer_active(tp, TT_PERSIST))
- tcp_timer_activate(tp, TT_REXMT,
- tp->t_rxtcur);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
sowwakeup(so);
if (sbavail(&so->so_snd))
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
goto check_delack;
}
} else if (th->th_ack == tp->snd_una &&
@@ -1906,7 +1906,7 @@
tp->t_flags |= TF_DELACK;
} else {
tp->t_flags |= TF_ACKNOW;
- tcp_output(tp);
+ tp->t_fb->tcp_output(tp);
}
goto check_delack;
}
@@ -1994,7 +1994,7 @@
* ACKNOW will be turned on later.
*/
if (DELAY_ACK(tp, tlen) && tlen != 0)
- tcp_timer_activate(tp, TT_DELACK,
+ tp->t_fb->tcp_timer_activate(tp, TT_DELACK,
tcp_delacktime);
else
tp->t_flags |= TF_ACKNOW;
@@ -2020,7 +2020,7 @@
TCP_PROBE5(connect__established, NULL, tp,
mtod(m, const char *), tp, th);
cc_conn_init(tp);
- tcp_timer_activate(tp, TT_KEEP,
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP,
TP_KEEPIDLE(tp));
}
} else {
@@ -2033,7 +2033,7 @@
* SYN-SENT* -> SYN-RECEIVED*
*/
tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
tcp_state_change(tp, TCPS_SYN_RECEIVED);
}
@@ -2391,7 +2391,7 @@
TCP_PROBE5(accept__established, NULL, tp,
mtod(m, const char *), tp, th);
cc_conn_init(tp);
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
}
/*
* If segment contains data or ACK, will call tcp_reass()
@@ -2478,7 +2478,7 @@
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
- if (!tcp_timer_active(tp, TT_REXMT) ||
+ if (!tp->t_fb->tcp_timer_active(tp, TT_REXMT) ||
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
@@ -2507,7 +2507,7 @@
}
} else
tp->snd_cwnd += tp->t_maxseg;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
@@ -2534,7 +2534,7 @@
/* Congestion signal before ack. */
cc_cong_signal(tp, th, CC_NDUPACK);
cc_ack_received(tp, th, CC_DUPACK);
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
if (tp->t_flags & TF_SACK_PERMIT) {
TCPSTAT_INC(
@@ -2541,12 +2541,12 @@
tcps_sack_recovery_episode);
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_maxseg;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
goto drop;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
KASSERT(tp->snd_limited <= 2,
("%s: tp->snd_limited too big",
__func__));
@@ -2593,7 +2593,7 @@
(tp->snd_nxt - tp->snd_una);
SOCKBUF_UNLOCK(&so->so_snd);
if (avail > 0)
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
sent = tp->snd_max - oldsndmax;
if (sent > tp->t_maxseg) {
KASSERT((tp->t_dupacks == 2 &&
@@ -2704,10 +2704,10 @@
* timer, using current (possibly backed-off) value.
*/
if (th->th_ack == tp->snd_max) {
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
needoutput = 1;
- } else if (!tcp_timer_active(tp, TT_PERSIST))
- tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ } else if (!tp->t_fb->tcp_timer_active(tp, TT_PERSIST))
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
/*
* If no data (only SYN) was ACK'd,
@@ -2777,7 +2777,7 @@
*/
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
soisdisconnected(so);
- tcp_timer_activate(tp, TT_2MSL,
+ tp->t_fb->tcp_timer_activate(tp, TT_2MSL,
(tcp_fast_finwait2_recycle ?
tcp_finwait2_timeout :
TP_MAXIDLE(tp)));
@@ -3049,7 +3049,7 @@
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
check_delack:
KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
@@ -3059,7 +3059,7 @@
if (tp->t_flags & TF_DELACK) {
tp->t_flags &= ~TF_DELACK;
- tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ tp->t_fb->tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
INP_WUNLOCK(tp->t_inpcb);
return;
@@ -3097,7 +3097,7 @@
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@@ -3143,7 +3143,7 @@
* The mbuf must still include the original packet header.
* tp may be NULL.
*/
-static void
+void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
int tlen, int rstreason)
{
@@ -3206,7 +3206,7 @@
/*
* Parse TCP options and place in tcpopt.
*/
-static void
+void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
int opt, optlen;
@@ -3300,7 +3300,7 @@
* It is still reflected in the segment length for
* sequencing purposes.
*/
-static void
+void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
int off)
{
@@ -3333,7 +3333,7 @@
* Collect new round-trip time estimate
* and update averages and current timeout.
*/
-static void
+void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
int delta;
@@ -3713,7 +3713,7 @@
* By setting snd_nxt to ti_ack, this forces retransmission timer to
* be started again.
*/
-static void
+void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
tcp_seq onxt = tp->snd_nxt;
@@ -3721,7 +3721,7 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
/*
@@ -3730,7 +3730,7 @@
*/
tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -329,7 +329,7 @@
flags &= ~TH_FIN;
sendwin = 1;
} else {
- tcp_timer_activate(tp, TT_PERSIST, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
}
@@ -423,10 +423,10 @@
len = 0;
if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
(off < (int) sbavail(&so->so_snd))) {
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
- if (!tcp_timer_active(tp, TT_PERSIST))
+ if (!tp->t_fb->tcp_timer_active(tp, TT_PERSIST))
tcp_setpersist(tp);
}
}
@@ -648,9 +648,9 @@
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_GT(tp->snd_max, tp->snd_una) &&
- !tcp_timer_active(tp, TT_REXMT) &&
- !tcp_timer_active(tp, TT_PERSIST)) {
- tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ !tp->t_fb->tcp_timer_active(tp, TT_REXMT) &&
+ !tp->t_fb->tcp_timer_active(tp, TT_PERSIST)) {
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
@@ -675,8 +675,8 @@
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
- if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
- !tcp_timer_active(tp, TT_PERSIST)) {
+ if (sbavail(&so->so_snd) && !tp->t_fb->tcp_timer_active(tp, TT_REXMT) &&
+ !tp->t_fb->tcp_timer_active(tp, TT_PERSIST)) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
@@ -1118,7 +1118,7 @@
*/
if (sack_rxmit == 0) {
if (len || (flags & (TH_SYN|TH_FIN)) ||
- tcp_timer_active(tp, TT_PERSIST))
+ tp->t_fb->tcp_timer_active(tp, TT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
@@ -1377,7 +1377,7 @@
* the retransmit. In persist state, just set snd_max.
*/
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
- !tcp_timer_active(tp, TT_PERSIST)) {
+ !tp->t_fb->tcp_timer_active(tp, TT_PERSIST)) {
tcp_seq startseq = tp->snd_nxt;
/*
@@ -1416,17 +1416,17 @@
* of retransmit time.
*/
timer:
- if (!tcp_timer_active(tp, TT_REXMT) &&
+ if (!tp->t_fb->tcp_timer_active(tp, TT_REXMT) &&
((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
(tp->snd_nxt != tp->snd_una))) {
- if (tcp_timer_active(tp, TT_PERSIST)) {
- tcp_timer_activate(tp, TT_PERSIST, 0);
+ if (tp->t_fb->tcp_timer_active(tp, TT_PERSIST)) {
+ tp->t_fb->tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
- tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
} else if (len == 0 && sbavail(&so->so_snd) &&
- !tcp_timer_active(tp, TT_REXMT) &&
- !tcp_timer_active(tp, TT_PERSIST)) {
+ !tp->t_fb->tcp_timer_active(tp, TT_REXMT) &&
+ !tp->t_fb->tcp_timer_active(tp, TT_PERSIST)) {
/*
* Avoid a situation where we do not set persist timer
* after a zero window condition. For example:
@@ -1481,7 +1481,7 @@
* away would be the really correct behavior instead.
*/
if (((tp->t_flags & TF_FORCEDATA) == 0 ||
- !tcp_timer_active(tp, TT_PERSIST)) &&
+ !tp->t_fb->tcp_timer_active(tp, TT_PERSIST)) &&
((flags & TH_SYN) == 0) &&
(error != EPERM)) {
if (sack_rxmit) {
@@ -1498,9 +1498,9 @@
tp->t_softerror = error;
return (error);
case ENOBUFS:
- if (!tcp_timer_active(tp, TT_REXMT) &&
- !tcp_timer_active(tp, TT_PERSIST))
- tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ if (!tp->t_fb->tcp_timer_active(tp, TT_REXMT) &&
+ !tp->t_fb->tcp_timer_active(tp, TT_PERSIST))
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
tp->snd_cwnd = tp->t_maxseg;
return (0);
case EMSGSIZE:
@@ -1545,8 +1545,8 @@
tp->rcv_adv = tp->rcv_nxt + recwin;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
- if (tcp_timer_active(tp, TT_DELACK))
- tcp_timer_activate(tp, TT_DELACK, 0);
+ if (tp->t_fb->tcp_timer_active(tp, TT_DELACK))
+ tp->t_fb->tcp_timer_activate(tp, TT_DELACK, 0);
#if 0
/*
* This completely breaks TCP if newreno is turned on. What happens
@@ -1569,7 +1569,7 @@
int tt;
tp->t_flags &= ~TF_PREVVALID;
- if (tcp_timer_active(tp, TT_REXMT))
+ if (tp->t_fb->tcp_timer_active(tp, TT_REXMT))
panic("tcp_setpersist: retransmit pending");
/*
* Start/restart persistance timer.
@@ -1576,7 +1576,7 @@
*/
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX);
- tcp_timer_activate(tp, TT_PERSIST, tt);
+ tp->t_fb->tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
Index: sys/netinet/tcp_sack.c
===================================================================
--- sys/netinet/tcp_sack.c
+++ sys/netinet/tcp_sack.c
@@ -579,7 +579,7 @@
int num_segs = 1;
INP_WLOCK_ASSERT(tp->t_inpcb);
- tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_fb->tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
/* Send one or 2 segments based on how much new data was acked. */
if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2)
@@ -589,7 +589,7 @@
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
}
#if 0
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -47,6 +47,7 @@
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
@@ -125,6 +126,8 @@
VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
#endif
+struct rwlock tcp_function_lock;
+
static int
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
{
@@ -146,6 +149,7 @@
&sysctl_net_inet_tcp_mss_check, "I",
"Default TCP Maximum Segment Size");
+
#ifdef INET6
static int
sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
@@ -236,6 +240,162 @@
void *ip4hdr, const void *ip6hdr);
static void tcp_timer_discard(struct tcpcb *, uint32_t);
+
+static struct tcp_function_block tcp_def_funcblk = {
+ "default",
+ tcp_timer_activate,
+ tcp_timer_active,
+ tcp_timer_stop,
+ tcp_output,
+ tcp_do_segment,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ 0
+};
+
+struct tcp_funchead t_functions;
+static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
+
+static struct tcp_function_block *
+find_tcp_functions_locked(struct tcp_function_set *fs)
+{
+ struct tcp_function *f;
+ struct tcp_function_block *blk = NULL;
+
+ TAILQ_FOREACH(f, &t_functions, next) {
+ if (strcmp(f->fb->tcp_block_name, fs->function_set_name) == 0) {
+ blk = f->fb;
+ break;
+ }
+ }
+ return(blk);
+}
+
+struct tcp_function_block *
+find_tcp_functions(struct tcp_function_set *fs)
+{
+ struct tcp_function_block *blk;
+
+ rw_rlock(&tcp_function_lock);
+ blk = find_tcp_functions_locked(fs);
+ if (blk)
+ refcount_acquire(&blk->refcnt);
+ rw_runlock(&tcp_function_lock);
+ return(blk);
+}
+
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *blk)
+{
+ struct tcp_function *f;
+ struct tcp_function_block *rblk = NULL;
+
+ rw_rlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, next) {
+ if (f->fb == blk) {
+ refcount_acquire(&blk->refcnt);
+ rblk = blk;
+ break;
+ }
+ }
+ rw_runlock(&tcp_function_lock);
+ return(rblk);
+}
+
+
+static int
+sysctl_net_inet_tcp_functions(SYSCTL_HANDLER_ARGS)
+{
+ int error = ENOENT;
+ struct tcp_function *f;
+ struct tcp_function_set fs;
+ struct tcp_function_block *blk;
+
+ memset(&fs, 0, sizeof(fs));
+ rw_wlock(&tcp_function_lock);
+ if (req->oldptr) {
+ TAILQ_FOREACH(f, &t_functions, next) {
+ if (f->fb == tcp_func_set_ptr) {
+ /* Found it. */
+ strcpy(fs.function_set_name, f->fb->tcp_block_name);
+ fs.pcbcnt = f->fb->refcnt;
+ error = SYSCTL_OUT(req, &fs, sizeof(fs));
+ if (error) {
+ goto done;
+ }
+ break;
+ }
+ }
+ }
+ if (req->newptr == NULL) {
+ goto done;
+ }
+ error = SYSCTL_IN(req, &fs, sizeof(fs));
+ if (error) {
+ goto done;
+ }
+ blk = find_tcp_functions_locked(&fs);
+ if (blk == NULL) {
+ error = ENOENT;
+ goto done;
+ }
+ tcp_func_set_ptr = blk;
+done:
+ rw_wunlock(&tcp_function_lock);
+ return (error);
+}
+
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setfunctions,
+ CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RW, 0, 0,
+ &sysctl_net_inet_tcp_functions, "I",
+ "Set the default TCP function stack");
+
+static int
+sysctl_net_inet_tcp_functions_list(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0, n, at;
+ struct tcp_function *f;
+ struct tcp_function_set *fs = NULL;
+ size_t sz;
+
+ rw_rlock(&tcp_function_lock);
+ n = 0;
+ TAILQ_FOREACH(f, &t_functions, next) {
+ n++;
+ }
+ rw_runlock(&tcp_function_lock);
+ if (req->oldptr == NULL) {
+ req->oldidx = ((n+2) * sizeof(struct tcp_function_set));
+ return(0);
+ }
+ sz = n * sizeof(struct tcp_function_set);
+ fs = malloc(sz, M_TEMP, M_WAITOK);
+ if (fs == NULL) {
+ return(ENOMEM);
+ }
+ at = 0;
+ memset(fs, 0, sz);
+ rw_rlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, next) {
+ strcpy(fs[at].function_set_name, f->fb->tcp_block_name);
+ fs[at].pcbcnt = f->fb->refcnt;
+ at++;
+ if (at >= n)
+ break;
+ }
+ rw_runlock(&tcp_function_lock);
+ error = SYSCTL_OUT(req, fs, sz);
+ free(fs, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, listfunctions,
+ CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RD, 0, 0,
+ &sysctl_net_inet_tcp_functions_list, "I",
+ "List the TCP function stacks and their reference counts");
+
+
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@@ -263,6 +423,8 @@
#define V_tcpcb_zone VNET(tcpcb_zone)
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
+
static struct mtx isn_mtx;
#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
@@ -311,6 +473,76 @@
return (hashsize);
}
+int
+register_tcp_functions(struct tcp_function_block *blk, int wait)
+{
+ struct tcp_function *n, *f;
+
+ if ((blk->tcp_timer_activate == NULL) ||
+ (blk->tcp_timer_active == NULL) ||
+ (blk->tcp_timer_stop == NULL) ||
+ (blk->tcp_output == NULL) ||
+ (blk->tcp_do_segment == NULL) ||
+ (blk->tcp_ctloutput == NULL) ||
+ (strlen(blk->tcp_block_name) == 0)) {
+ /*
+ * The only functions a block is allowed to leave
+ * undefined are tcp_fb_init and tcp_fb_fini.
+ */
+ return (EINVAL);
+ }
+ n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
+ if (n == NULL) {
+ return (ENOMEM);
+ }
+ n->fb = blk;
+ rw_wlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, next) {
+ if (strcmp(f->fb->tcp_block_name, blk->tcp_block_name) == 0) {
+ /* Duplicate name space not allowed */
+ rw_wunlock(&tcp_function_lock);
+ return (EALREADY);
+ }
+ }
+ refcount_init(&blk->refcnt, 0);
+ TAILQ_INSERT_TAIL(&t_functions, n, next);
+ rw_wunlock(&tcp_function_lock);
+ return(0);
+}
+
+int
+deregister_tcp_functions(struct tcp_function_block *blk)
+{
+ struct tcp_function *f;
+ int error = ENOENT;
+
+ if (strcmp(blk->tcp_block_name, "default") == 0) {
+ /* You can't un-register the default */
+ return (EPERM);
+ }
+ if (blk == tcp_func_set_ptr) {
+ /* You can't free the current default */
+ return (EBUSY);
+ }
+ if (blk->refcnt) {
+ /* Still tcb attached */
+ return (EBUSY);
+ }
+ rw_wlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, next) {
+ if (f->fb == blk) {
+ /* Found */
+ TAILQ_REMOVE(&t_functions, f, next);
+ f->fb = NULL;
+ free(f, M_TCPFUNCTIONS);
+ error = 0;
+ break;
+ }
+ }
+ rw_wunlock(&tcp_function_lock);
+ return (error);
+}
+
void
tcp_init(void)
{
@@ -318,7 +550,6 @@
int hashsize;
tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
-
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
&V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
@@ -325,7 +556,10 @@
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
&V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
-
+ /* Set up the TCP function block list. */
+ TAILQ_INIT(&t_functions);
+ rw_init_flags(&tcp_function_lock, "tcp_func_lock", 0);
+ register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
hashsize = TCBHASHSIZE;
TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
if (hashsize == 0) {
@@ -768,7 +1002,13 @@
tp->ccv = &tm->ccv;
tp->ccv->type = IPPROTO_TCP;
tp->ccv->ccvc.tcp = tp;
-
+ rw_rlock(&tcp_function_lock);
+ tp->t_fb = tcp_func_set_ptr;
+ refcount_acquire(&tp->t_fb->refcnt);
+ rw_runlock(&tcp_function_lock);
+ if (tp->t_fb->tcp_fb_init) {
+ (*tp->t_fb->tcp_fb_init)(tp);
+ }
/*
* Use the current system default CC algorithm.
*/
@@ -779,6 +1019,9 @@
if (CC_ALGO(tp)->cb_init != NULL)
if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ if (tp->t_fb->tcp_fb_fini)
+ (*tp->t_fb->tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->refcnt);
uma_zfree(V_tcpcb_zone, tm);
return (NULL);
}
@@ -785,6 +1028,9 @@
tp->osd = &tm->osd;
if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
+ if (tp->t_fb->tcp_fb_fini)
+ (*tp->t_fb->tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->refcnt);
uma_zfree(V_tcpcb_zone, tm);
return (NULL);
}
@@ -925,7 +1171,7 @@
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tcp_state_change(tp, TCPS_CLOSED);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
TCPSTAT_INC(tcps_drops);
} else
TCPSTAT_INC(tcps_conndrops);
@@ -955,11 +1201,11 @@
* callout, and the last discard function called will take care of
* deleting the tcpcb.
*/
- tcp_timer_stop(tp, TT_REXMT);
- tcp_timer_stop(tp, TT_PERSIST);
- tcp_timer_stop(tp, TT_KEEP);
- tcp_timer_stop(tp, TT_2MSL);
- tcp_timer_stop(tp, TT_DELACK);
+ tp->t_fb->tcp_timer_stop(tp, TT_REXMT);
+ tp->t_fb->tcp_timer_stop(tp, TT_PERSIST);
+ tp->t_fb->tcp_timer_stop(tp, TT_KEEP);
+ tp->t_fb->tcp_timer_stop(tp, TT_2MSL);
+ tp->t_fb->tcp_timer_stop(tp, TT_DELACK);
/*
* If we got enough samples through the srtt filter,
@@ -1044,6 +1290,9 @@
inp->inp_ppcb = NULL;
if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
/* We own the last reference on tcpcb, let's free it. */
+ if (tp->t_fb->tcp_fb_fini)
+ (*tp->t_fb->tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->refcnt);
tp->t_inpcb = NULL;
uma_zfree(V_tcpcb_zone, tp);
released = in_pcbrele_wlocked(inp);
@@ -1105,6 +1354,9 @@
tp->t_timers->tt_flags &= ~timer_type;
if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
/* We own the last reference on this tcpcb, let's free it. */
+ if (tp->t_fb->tcp_fb_fini)
+ (*tp->t_fb->tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->refcnt);
tp->t_inpcb = NULL;
uma_zfree(V_tcpcb_zone, tp);
if (in_pcbrele_wlocked(inp)) {
@@ -1865,7 +2117,7 @@
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_SACK_PERMIT)
EXIT_FASTRECOVERY(tp->t_flags);
- tcp_output(tp);
+ tp->t_fb->tcp_output(tp);
}
#ifdef INET
Index: sys/netinet/tcp_syncache.h
===================================================================
--- sys/netinet/tcp_syncache.h
+++ sys/netinet/tcp_syncache.h
@@ -52,6 +52,7 @@
struct syncache {
TAILQ_ENTRY(syncache) sc_hash;
struct in_conninfo sc_inc; /* addresses */
+ struct tcp_function_block *sc_t_fb; /* Function block parent was using */
int sc_rxttime; /* retransmit time */
u_int16_t sc_rxmits; /* retransmit counter */
u_int32_t sc_tsreflect; /* timestamp to reflect */
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -41,6 +41,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
@@ -817,6 +818,26 @@
tp->irs = sc->sc_irs;
tcp_rcvseqinit(tp);
tcp_sendseqinit(tp);
+ if (sc->sc_t_fb && (sc->sc_t_fb != tp->t_fb)) {
+ /*
+ * The parent listen socket was using a t_fb
+ * other than the one this tcpcb was given. If
+ * that block is still registered, release our
+ * ref on tp->t_fb and pick one up on the
+ * parent's block instead.
+ */
+ struct tcp_function_block *blk;
+
+ blk = find_and_ref_tcp_fb(sc->sc_t_fb);
+ if (blk) {
+ if (tp->t_fb->tcp_fb_fini)
+ (*tp->t_fb->tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->refcnt);
+ tp->t_fb = blk;
+ if (tp->t_fb->tcp_fb_init) {
+ (*tp->t_fb->tcp_fb_init)(tp);
+ }
+ }
+ }
tp->snd_wl1 = sc->sc_irs;
tp->snd_max = tp->iss + 1;
tp->snd_nxt = tp->iss + 1;
@@ -884,7 +905,7 @@
tp->t_keepidle = sototcpcb(lso)->t_keepidle;
tp->t_keepintvl = sototcpcb(lso)->t_keepintvl;
tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
soisconnected(so);
@@ -1094,6 +1115,7 @@
struct label *maclabel;
#endif
struct syncache scs;
+ struct tcp_function_block *blk;
struct ucred *cred;
INP_WLOCK_ASSERT(inp); /* listen socket */
@@ -1118,7 +1140,7 @@
win = sbspace(&so->so_rcv);
sb_hiwat = so->so_rcv.sb_hiwat;
ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE));
-
+ blk = tp->t_fb;
/* By the time we drop the lock these should no longer be used. */
so = NULL;
tp = NULL;
@@ -1248,6 +1270,7 @@
#endif
sc->sc_irs = th->th_seq;
sc->sc_iss = arc4random();
+ sc->sc_t_fb = blk;
sc->sc_flags = 0;
sc->sc_flowlabel = 0;
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c
+++ sys/netinet/tcp_timer.c
@@ -292,7 +292,7 @@
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
INP_WUNLOCK(inp);
CURVNET_RESTORE();
}
@@ -543,7 +543,7 @@
}
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
out:
@@ -798,7 +798,7 @@
cc_cong_signal(tp, NULL, CC_RTO);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tcp_output(tp);
out:
#ifdef TCPDEBUG
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -47,6 +47,7 @@
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
@@ -508,8 +509,8 @@
(error = tcp_offload_connect(so, nam)) == 0)
goto out;
#endif
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+ error = tp->t_fb->tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
INP_WUNLOCK(inp);
@@ -579,7 +580,7 @@
(error = tcp_offload_connect(so, nam)) == 0)
goto out;
#endif
- error = tcp_output(tp);
+ error = tp->t_fb->tcp_output(tp);
goto out;
}
#endif
@@ -596,8 +597,8 @@
(error = tcp_offload_connect(so, nam)) == 0)
goto out;
#endif
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+ error = tp->t_fb->tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
@@ -773,7 +774,7 @@
socantsendmore(so);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- error = tcp_output(tp);
+ error = tp->t_fb->tcp_output(tp);
out:
TCPDEBUG2(PRU_SHUTDOWN);
@@ -809,7 +810,7 @@
tcp_offload_rcvd(tp);
else
#endif
- tcp_output(tp);
+ tp->t_fb->tcp_output(tp);
out:
TCPDEBUG2(PRU_RCVD);
@@ -911,7 +912,7 @@
!(flags & PRUS_NOTREADY)) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
- error = tcp_output(tp);
+ error = tp->t_fb->tcp_output(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
@@ -961,7 +962,7 @@
tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
if (!(flags & PRUS_NOTREADY)) {
tp->t_flags |= TF_FORCEDATA;
- error = tcp_output(tp);
+ error = tp->t_fb->tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
}
}
@@ -997,7 +998,7 @@
error = sbready(&so->so_snd, m, count);
SOCKBUF_UNLOCK(&so->so_snd);
if (error == 0)
- error = tcp_output(tp);
+ error = tp->t_fb->tcp_output(tp);
INP_WUNLOCK(inp);
return (error);
@@ -1349,13 +1350,9 @@
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
- int error, opt, optval;
- u_int ui;
+ int error;
struct inpcb *inp;
struct tcpcb *tp;
- struct tcp_info ti;
- char buf[TCP_CA_NAME_MAX];
- struct cc_algo *algo;
error = 0;
inp = sotoinpcb(so);
@@ -1383,7 +1380,28 @@
INP_WUNLOCK(inp);
return (ECONNRESET);
}
+ tp = intotcpcb(inp);
+ if (tp->t_fb->tcp_ctloutput) {
+ /* Pass the INP in locked; the callee must unlock it. */
+ return (tp->t_fb->tcp_ctloutput(so, sopt, inp, tp));
+ } else {
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+}
+
+int
+tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+ int error, opt, optval;
+ u_int ui;
+ struct tcp_info ti;
+ struct tcp_function_set fsn;
+ struct tcp_function_block *blk;
+ char buf[TCP_CA_NAME_MAX];
+ struct cc_algo *algo;
+
switch (sopt->sopt_dir) {
case SOPT_SET:
switch (sopt->sopt_name) {
@@ -1451,7 +1469,7 @@
else if (tp->t_flags & TF_NOPUSH) {
tp->t_flags &= ~TF_NOPUSH;
if (TCPS_HAVEESTABLISHED(tp->t_state))
- error = tcp_output(tp);
+ error = tp->t_fb->tcp_output(tp);
}
goto unlock_and_done;
@@ -1546,7 +1564,7 @@
*/
if ((tp->t_state > TCPS_LISTEN) &&
(tp->t_state <= TCPS_CLOSING))
- tcp_timer_activate(tp, TT_KEEP,
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP,
TP_KEEPIDLE(tp));
break;
case TCP_KEEPINTVL:
@@ -1553,7 +1571,7 @@
tp->t_keepintvl = ui;
if ((tp->t_state == TCPS_FIN_WAIT_2) &&
(TP_MAXIDLE(tp) > 0))
- tcp_timer_activate(tp, TT_2MSL,
+ tp->t_fb->tcp_timer_activate(tp, TT_2MSL,
TP_MAXIDLE(tp));
break;
case TCP_KEEPINIT:
@@ -1560,7 +1578,7 @@
tp->t_keepinit = ui;
if (tp->t_state == TCPS_SYN_RECEIVED ||
tp->t_state == TCPS_SYN_SENT)
- tcp_timer_activate(tp, TT_KEEP,
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP,
TP_KEEPINIT(tp));
break;
}
@@ -1576,7 +1594,7 @@
tp->t_keepcnt = ui;
if ((tp->t_state == TCPS_FIN_WAIT_2) &&
(TP_MAXIDLE(tp) > 0))
- tcp_timer_activate(tp, TT_2MSL,
+ tp->t_fb->tcp_timer_activate(tp, TT_2MSL,
TP_MAXIDLE(tp));
goto unlock_and_done;
@@ -1599,6 +1617,43 @@
goto unlock_and_done;
#endif
+ case TCP_FUNCTION_BLK:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &fsn, sizeof fsn,
+ sizeof fsn);
+ if (error)
+ return (error);
+ INP_WLOCK_RECHECK(inp);
+ if (tp->t_state != TCPS_CLOSED) {
+ /*
+ * The connection has advanced past the
+ * initial state, so we cannot switch:
+ * a new set of functions may not be
+ * compatible with its current state.
+ */
+ INP_WUNLOCK(inp);
+ return (EINVAL);
+ }
+ blk = find_tcp_functions(&fsn);
+ if (blk == NULL) {
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquires a ref on the
+ * new one.
+ */
+ if (tp->t_fb->tcp_fb_fini)
+ (*tp->t_fb->tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->refcnt);
+ tp->t_fb = blk;
+ if (tp->t_fb->tcp_fb_init) {
+ (*tp->t_fb->tcp_fb_init)(tp);
+ }
+ goto unlock_and_done;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -1678,6 +1733,12 @@
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
#endif
+ case TCP_FUNCTION_BLK:
+ strcpy(fsn.function_set_name, tp->t_fb->tcp_block_name);
+ fsn.pcbcnt = tp->t_fb->refcnt;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &fsn, sizeof fsn);
+ break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -1770,7 +1831,7 @@
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- tcp_output(tp);
+ tp->t_fb->tcp_output(tp);
}
}
@@ -1829,7 +1890,7 @@
timeout = (tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : TP_MAXIDLE(tp);
- tcp_timer_activate(tp, TT_2MSL, timeout);
+ tp->t_fb->tcp_timer_activate(tp, TT_2MSL, timeout);
}
}
}
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -88,6 +88,41 @@
#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */
+/*
+ * TODO: We have yet to brave plowing into
+ * tcp_input() and the pru_usrreq() block.
+ * Right now these go through the old standard
+ * entry points, which is acceptable for now but
+ * may need to change in the long term. If we do
+ * tackle tcp_input(), then we need to get rid of
+ * the tcp_do_segment() function below.
+ */
+
+struct tcp_function_block {
+ char tcp_block_name[32];
+ void (*tcp_timer_activate)(struct tcpcb *,
+ uint32_t, u_int);
+ int (*tcp_timer_active)(struct tcpcb *, uint32_t);
+ void (*tcp_timer_stop)(struct tcpcb *, uint32_t);
+ int (*tcp_output)(struct tcpcb *);
+ void (*tcp_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int);
+ int (*tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp);
+ void (*tcp_fb_init)(struct tcpcb *); /* Can allocate memory in t_fb_ptr */
+ void (*tcp_fb_fini)(struct tcpcb *); /* Fini should free any memory in t_fb_ptr */
+ volatile uint32_t refcnt;
+};
+
+struct tcp_function {
+ TAILQ_ENTRY(tcp_function) next;
+ struct tcp_function_block *fb;
+};
+
+TAILQ_HEAD(tcp_funchead, tcp_function);
+
/*
* Tcp control block, one per tcp; fields:
* Organized for 16 byte cacheline efficiency.
@@ -206,9 +241,10 @@
u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
u_int t_pmtud_saved_maxopd; /* pre-blackhole MSS */
u_int t_flags2; /* More tcpcb flags storage */
-
uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */
- void *t_pspare2[4]; /* 1 TCP_SIGNATURE, 3 TBD */
+ struct tcp_function_block *t_fb;/* TCP function call block */
+ void *t_fb_ptr; /* Pointer to t_fb specific data */
+ void *t_pspare2[2]; /* 1 TCP_SIGNATURE, 1 TBD */
#if defined(_KERNEL) && defined(TCPPCAP)
struct mbufq t_inpkts; /* List of saved input packets. */
struct mbufq t_outpkts; /* List of saved output packets. */
@@ -534,6 +570,8 @@
#define tcps_rcvmemdrop tcps_rcvreassfull /* compat */
#ifdef _KERNEL
+#define TI_UNLOCKED 1
+#define TI_RLOCKED 2
#include <sys/counter.h>
VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */
@@ -685,6 +723,16 @@
void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
int tcp_input(struct mbuf **, int *, int);
+void tcp_do_segment(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+int register_tcp_functions(struct tcp_function_block *blk, int wait);
+int deregister_tcp_functions(struct tcp_function_block *blk);
+struct tcp_function_block *find_tcp_functions(struct tcp_function_set *fs);
+struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
+
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
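
To make the registration API concrete, here is a minimal, hypothetical module sketch built against the declarations above. The block name "minimal" is a placeholder, the stock handlers are reused the same way the fastpath module earlier in this diff reuses them, and the include list is abbreviated (tcp_timer.h and the other prerequisites of tcp_var.h are omitted).

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <netinet/tcp_var.h>	/* headers abbreviated */

static struct tcp_function_block minimal_funcblk = {
	"minimal",		/* tcp_block_name (placeholder) */
	tcp_timer_activate,	/* reuse the stock timer handlers */
	tcp_timer_active,
	tcp_timer_stop,
	tcp_output,		/* reuse the stock output/input paths */
	tcp_do_segment,
	tcp_default_ctloutput,
	NULL,			/* tcp_fb_init is optional */
	NULL,			/* tcp_fb_fini is optional */
	0			/* refcnt */
};

static int
minimal_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		return (register_tcp_functions(&minimal_funcblk, M_WAITOK));
	case MOD_QUIESCE:
		/* Refuse to unload while any tcpcb still references us. */
		if (minimal_funcblk.refcnt != 0)
			return (EBUSY);
		return (0);
	case MOD_UNLOAD:
		return (deregister_tcp_functions(&minimal_funcblk));
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t minimal_mod = {
	"tcp_minimal",
	minimal_modevent,
	NULL
};
DECLARE_MODULE(tcp_minimal, minimal_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
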
Index: sys/netinet/toecore.c
===================================================================
--- sys/netinet/toecore.c
+++ sys/netinet/toecore.c
@@ -508,8 +508,8 @@
tod->tod_pcb_detach(tod, tp);
KASSERT(!(tp->t_flags & TF_TOE),
("%s: tp %p still offloaded.", __func__, tp));
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- (void) tcp_output(tp);
+ tp->t_fb->tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+ (void) tp->t_fb->tcp_output(tp);
} else {
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
Index: usr.sbin/Makefile
===================================================================
--- usr.sbin/Makefile
+++ usr.sbin/Makefile
@@ -84,6 +84,7 @@
spray \
syslogd \
sysrc \
+ tcp_function_ctrl \
tcpdrop \
tcpdump \
traceroute \
Index: usr.sbin/tcp_function_ctrl/Makefile
===================================================================
--- usr.sbin/tcp_function_ctrl/Makefile
+++ usr.sbin/tcp_function_ctrl/Makefile
@@ -0,0 +1,12 @@
+# @(#)Makefile 8.1 (Berkeley) 6/9/93
+# $FreeBSD$
+
+PROG= tcp_function_ctrl
+MAN= tcp_function_ctrl.8
+SRCS= tcp_function_ctrl.c
+CFLAGS+= -Wall -Werror
+
+BINDIR= /usr/bin
+
+.include <bsd.prog.mk>
+
Index: usr.sbin/tcp_function_ctrl/tcp_function_ctrl.8
===================================================================
--- usr.sbin/tcp_function_ctrl/tcp_function_ctrl.8
+++ usr.sbin/tcp_function_ctrl/tcp_function_ctrl.8
@@ -0,0 +1,75 @@
+.\" Copyright (c) 2015
+.\" Netflix Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd October 31, 2015
+.Dt TCP_FUNCTION_CTRL 8
+.Os
+.Sh NAME
+.Nm tcp_function_ctrl
+.Nd Control and list TCP function sets
+.Sh SYNOPSIS
+.Nm
+.Fl s Ar name
+.Nm
+.Fl l
+.Nm
+.Fl d
+.Sh DESCRIPTION
+The
+.Nm
+utility lists and controls the TCP function sets that are available
+and in use on the system.
+The system has a default function set containing the standard
+FreeBSD TCP functions.
+Kernel loadable modules may add function sets that implement
+different TCP behavior; each set is named by its designer and
+becomes available once its module is loaded.
+You may use
+.Nm
+to list the installed function sets, to ask which set is the
+default, or to select a new default.
+The default function set is the one used by any new TCP connection.
+A program may instead select a set for an individual socket with
+the
+.Dv TCP_FUNCTION_BLK
+socket option at connection setup.
+.Sh OPTIONS
+The following options are available:
+.Bl -tag -width indent
+.It Fl l
+List the TCP function sets currently installed on this machine,
+together with the number of PCBs using each.
+.It Fl d
+Show the default TCP function set, which is used when a program
+specifies no socket option.
+.It Fl s Ar name
+Set the default TCP function set to
+.Ar name .
+.El
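+.Sh EXAMPLES
+List the installed TCP function sets and the number of PCBs using
+each:
+.Pp
+.Dl tcp_function_ctrl -l
+.Pp
+Show the current default function set:
+.Pp
+.Dl tcp_function_ctrl -d
+.Pp
+Make the
+.Dq fastslow
+function set the default for new connections:
+.Pp
+.Dl tcp_function_ctrl -s fastslow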
+.Sh SEE ALSO
+.Xr tcp 4
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+.An Randall Stewart Aq Mt rrs@FreeBSD.org
Index: usr.sbin/tcp_function_ctrl/tcp_function_ctrl.c
===================================================================
--- usr.sbin/tcp_function_ctrl/tcp_function_ctrl.c
+++ usr.sbin/tcp_function_ctrl/tcp_function_ctrl.c
@@ -0,0 +1,112 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <netinet/tcp.h>
+#include <getopt.h>
+
+
+int
+main(int argc, char **argv)
+{
+ struct tcp_function_set *set;
+ struct tcp_function_set def;
+ size_t sz;
+ int num, i, cnt;
+ char *name = NULL;
+ int show_default = 0;
+ int list_functions = 0;
+
+ while ((i = getopt(argc, argv, "s:ld?h")) != -1) {
+ switch (i) {
+ case 's':
+ name = optarg;
+ if (strlen(name) >= 32) {
+ printf("Name to large\n");
+ goto use;
+ }
+ break;
+ case 'd':
+ show_default = 1;
+ break;
+ case 'l':
+ list_functions = 1;
+ break;
+ case 'h':
+ case '?':
+ use:
+ printf("Use %s -s name <or> -l <or> -d\n", argv[0]);
+ return(0);
+ break;
+ }
+ }
+ if ((show_default == 0) && (name == NULL) && (list_functions == 0)) {
+ goto use;
+ }
+ if (show_default) {
+ sz = sizeof(def);
+ if (sysctlbyname("net.inet.tcp.setfunctions", &def, &sz,
+ NULL, 0)) {
+ printf("Cant get TCP default function name error:%d\n", errno);
+ exit(-1);
+ }
+ printf("The TCP default functions is named:%s with %d pcbs in use\n",
+ def.function_set_name,
+ def.pcbcnt);
+ return(0);
+ }
+ if (list_functions) {
+ sz = 0;
+ if (sysctlbyname("net.inet.tcp.listfunctions", NULL, &sz,
+ NULL, 0)) {
+ printf("At copy out to get sizes ... err:%d\n", errno);
+ exit(-1);
+ }
+ num = (sz/sizeof(struct tcp_function_set)) + 1;
+ sz = sizeof(struct tcp_function_set) * num;
+ set = malloc(sz);
+ if (set == NULL) {
+ printf("Malloc failed to get interim memory for list err:%d\n",
+ errno);
+ return(-1);
+ }
+ memset(set, 0, sz);
+ if (sysctlbyname("net.inet.tcp.listfunctions", set, &sz,
+ NULL, 0)) {
+ printf("Can't get the list from the kernel error:%d\n", errno);
+ exit(-1);
+ }
+ num = sz/sizeof(struct tcp_function_set);
+ printf("The Following TCP function sets are available\n");
+ for(i=0; i<num; i++) {
+ cnt = printf("TCP Stack:%s",
+ set[i].function_set_name);
+ if (cnt < 32) {
+ for(; cnt<32; cnt++) {
+ printf(" ");
+ }
+ }
+ printf("Number-of-TCB's:%d\n",
+ set[i].pcbcnt);
+ }
+ return(0);
+ }
+ if (name != NULL) {
+ memset(&def, 0, sizeof(def));
+ strcpy(def.function_set_name, name);
+ sz = sizeof(def);
+ if (sysctlbyname("net.inet.tcp.setfunctions",
+ NULL, 0, &def, sz)) {
+ printf("Can't set in %s as the new default error:%d\n",
+ name,
+ errno);
+ exit(-1);
+ }
+ }
+ return(0);
+}
