Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -343,6 +343,7 @@
 	${_syscons} \
 	sysvipc \
 	${_ti} \
+	tcp/fastpath \
 	tests/framework \
 	tests/callout_test \
 	tl \
Index: sys/modules/tcp/fastpath/Makefile
===================================================================
--- sys/modules/tcp/fastpath/Makefile
+++ sys/modules/tcp/fastpath/Makefile
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
+
+KMOD=	fastpath
+SRCS=	fastpath.c
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+.include <bsd.kmod.mk>
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -167,7 +167,7 @@
 #define	TCP_KEEPCNT	1024	/* L,N number of keepalives before close */
 #define	TCP_PCAP_OUT	2048	/* number of output packets to keep */
 #define	TCP_PCAP_IN	4096	/* number of input packets to keep */
-
+#define	TCP_FUNCTION_BLK 8192	/* Set the tcp function pointers to the specified stack */
 /* Start of reserved space for third-party user-settable options. */
 #define	TCP_VENDOR	SO_VENDOR
 
@@ -245,5 +245,11 @@
 	u_int32_t	__tcpi_pad[26];		/* Padding. */
 };
 #endif
+#define TCP_FUNCTION_NAME_LEN_MAX 32
+struct tcp_function_set {
+	char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
+	uint32_t pcbcnt;
+};
+
 
 #endif /* !_NETINET_TCP_H_ */
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -230,23 +230,6 @@
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
-static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
-		     struct socket *, struct tcpcb *, int, int, uint8_t,
-		     int);
-static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
-		     struct tcpcb *, int, int);
-static void	 tcp_pulloutofband(struct socket *,
-		     struct tcphdr *, struct mbuf *, int);
-static void	 tcp_xmit_timer(struct tcpcb *, int);
-static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
-			    uint16_t type);
-static void inline	cc_conn_init(struct tcpcb *tp);
-static void inline	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline	hhook_run_tcp_est_in(struct tcpcb *tp,
-			    struct tcphdr *th, struct tcpopt *to);
-
 /*
  * TCP statistics are stored in an "array" of counter(9)s.
  */
@@ -272,7 +255,7 @@
 /*
  * Wrapper for the TCP established input helper hook.
  */
-static void inline
+void
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
 	struct tcp_hhook_data hhook_data;
@@ -290,7 +273,7 @@
 /*
  * CC wrapper hook functions
  */
-static void inline
+void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -322,7 +305,7 @@
 	}
 }
 
-static void inline
+void
 cc_conn_init(struct tcpcb *tp)
 {
 	struct hc_metrics_lite metrics;
@@ -446,7 +429,7 @@
 	}
 }
 
-static void inline
+void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -601,9 +584,6 @@
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 	int ti_locked;
-#define	TI_UNLOCKED	1
-#define	TI_RLOCKED	2
-
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
@@ -1175,7 +1155,7 @@
			 * contains. 
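
[Editor's aside: the tcp.h hunk above defines the userland knob for stack selection. A minimal usage sketch, not part of the patch; it assumes the option is set with a struct tcp_function_set and that the module registers its stack under a name such as "fastpath" (the name is a guess from the module name; use whatever the stack actually registers).]

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

/* Hypothetical helper: point an existing TCP socket at another stack. */
static int
set_tcp_stack(int s, const char *name)
{
	struct tcp_function_set fs;

	memset(&fs, 0, sizeof(fs));
	strlcpy(fs.function_set_name, name, TCP_FUNCTION_NAME_LEN_MAX);
	/* pcbcnt presumably reports usage on getsockopt(); zero it on set. */
	return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs)));
}
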
tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); @@ -1421,7 +1401,7 @@ * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); @@ -1476,7 +1456,7 @@ return (IPPROTO_DONE); } -static void +void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) @@ -1787,7 +1767,7 @@ tp->t_rxtcur); sowwakeup(so); if (sbavail(&so->so_snd)) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && @@ -1906,7 +1886,7 @@ tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } @@ -2507,7 +2487,7 @@ } } else tp->snd_cwnd += tp->t_maxseg; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -2541,12 +2521,12 @@ tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); @@ -2593,7 +2573,7 @@ (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && @@ -3049,7 +3029,7 @@ * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); check_delack: KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", @@ -3097,7 +3077,7 @@ ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; @@ -3143,7 +3123,7 @@ * The mbuf must still include the original packet header. * tp may be NULL. */ -static void +void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { @@ -3206,7 +3186,7 @@ /* * Parse TCP options and place in tcpopt. */ -static void +void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; @@ -3300,7 +3280,7 @@ * It is still reflected in the segment length for * sequencing purposes. */ -static void +void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { @@ -3333,7 +3313,7 @@ * Collect new round-trip time estimate * and update averages and current timeout. */ -static void +void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; @@ -3713,7 +3693,7 @@ * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. 
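
[Editor's aside: every former direct call to tcp_output()/tcp_do_segment() above now dispatches through tp->t_fb. The block structure itself is not shown in this section; a simplified sketch of the shape implied by the call sites. Only the field names tfb_tcp_output and tfb_tcp_do_segment appear in the diff; everything else here is assumed.]

/* Sketch only; the real definition lives elsewhere in the patch. */
struct tcp_function_block_sketch {
	char	name[TCP_FUNCTION_NAME_LEN_MAX];
	int	(*tfb_tcp_output)(struct tcpcb *);
	void	(*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
		    struct socket *, struct tcpcb *, int, int, uint8_t, int);
	/* ...timer, init and teardown hooks would also live here... */
};
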
*/ -static void +void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; @@ -3730,7 +3710,7 @@ */ tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; Index: sys/netinet/tcp_sack.c =================================================================== --- sys/netinet/tcp_sack.c +++ sys/netinet/tcp_sack.c @@ -589,7 +589,7 @@ if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); } #if 0 Index: sys/netinet/tcp_stacks/fastpath.c =================================================================== --- sys/netinet/tcp_stacks/fastpath.c +++ sys/netinet/tcp_stacks/fastpath.c @@ -0,0 +1,2486 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 2007-2008,2010 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * Copyright (c) 2015 Netflix Inc. + * All rights reserved. + * + * Portions of this software were developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart, + * James Healy and David Hayes, made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * + * Portions of this software were developed by Randall R. Stewart while + * working for Netflix Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h"		/* for ipfw_fwd */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_kdtrace.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>		/* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES		/* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_options.h>
+#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
+#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/cc/cc.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#include <security/mac/mac_framework.h>
+
+static const int tcprexmtthresh = 3;
+
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
+VNET_DECLARE(int, tcp_do_rfc3042);
+#define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_insecure_rst);
+#define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
+VNET_DECLARE(int, tcp_insecure_syn);
+#define	V_tcp_insecure_syn	VNET(tcp_insecure_syn)
+
+extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+extern void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+		struct tcpcb *, int, int);
+extern void tcp_pulloutofband(struct socket *,
+		struct tcphdr *, struct mbuf *, int);
+extern void tcp_xmit_timer(struct tcpcb *, int);
+extern void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+extern void tcp_mss(struct tcpcb *tp, int offer);
+extern void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+		uint16_t type);
+extern void cc_conn_init(struct tcpcb *tp);
+extern void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+extern void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+extern void hhook_run_tcp_est_in(struct tcpcb *tp,
+		struct tcphdr *th, struct tcpopt *to);
+
+extern void kmod_tcpstat_inc(int statnum);
+#ifdef TCP_SIGNATURE
+extern int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
+	    struct tcpopt *to, struct tcphdr *th, u_int tcpbflag);
+#endif
+
+static void	tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
+		    struct socket *, struct tcpcb *, int, int, uint8_t,
+		    int);
+
+static void	tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
+		    struct socket *, struct tcpcb *, int, int, uint8_t,
+		    int);
+
+/*
+ * Indicate whether this ack should be delayed. 
We can delay the ack if + * following conditions are met: + * - There is no delayed ack timer in progress. + * - Our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window. + * - LRO wasn't used for this segment. We make sure by checking that the + * segment size is not larger than the MSS. + * - Delayed acks are enabled or this is a half-synchronized T/TCP + * connection. + */ +#define DELAY_ACK(tp, tlen) \ + ((!tcp_timer_active(tp, TT_DELACK) && \ + (tp->t_flags & TF_RXWIN0SENT) == 0) && \ + (tlen <= tp->t_maxopd) && \ + (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) + +/* + * So how is this faster than the normal fast ack? + * It basically allows us to also stay in the fastpath + * when a window-update ack also arrives. In testing + * we saw only 25-30% of connections doing fastpath + * due to the fact that along with moving forward + * in sequence the window was also updated. + */ +static void +tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, + int ti_locked, u_long tiwin) +{ + int acked; + int winup_only=0; +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; +#endif + /* + * The following if statment will be true if + * we are doing the win_up_in_fp + * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) + * - No more new data, but we have an ack for new data + * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) + * - No more new data, the same ack point but the window grew + * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd) + */ + if ((SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { + winup_only = 1; + TCPSTAT_INC(tcps_rcvwinupd); + } + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + } + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * This is a pure ack for outstanding data. + */ + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + } + ti_locked = TI_UNLOCKED; + + TCPSTAT_INC(tcps_predack); + + /* + * "bad retransmit" recovery. + */ + if (tp->t_rxtshift == 1 && + tp->t_flags & TF_PREVVALID && + (int)(ticks - tp->t_badrxtwin) < 0) { + cc_cong_signal(tp, th, CC_RTO_ERR); + } + + /* + * Recalculate the transmit timer / rtt. + * + * Some boxes send broken timestamp replies + * during the SYN+ACK phase, ignore + * timestamps of 0 or we could calculate a + * huge RTT and blow up the retransmit timer. 
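
[Editor's aside: the three window-update cases listed above reduce to a single monotonicity test on (wl1, wl2, window); a hypothetical standalone restatement of the if() used in tcp_do_fastack(), same tcpcb fields, not part of the patch.]

static inline int
segment_updates_window(const struct tcpcb *tp, const struct tcphdr *th,
    u_long tiwin)
{
	if (SEQ_LT(tp->snd_wl1, th->th_seq))		/* newer data seen */
		return (1);
	if (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
		return (1);				/* same data, newer ack */
	return (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack &&
	    tiwin > tp->snd_wnd);			/* same point, bigger window */
}
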
+ */ + if ((to->to_flags & TOF_TS) != 0 && + to->to_tsecr) { + u_int t; + + t = tcp_ts_getticks() - to->to_tsecr; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + tcp_xmit_timer(tp, + TCP_TS_TO_TICKS(t) + 1); + } else if (tp->t_rtttime && + SEQ_GT(th->th_ack, tp->t_rtseq)) { + if (!tp->t_rttlow || + tp->t_rttlow > ticks - tp->t_rtttime) + tp->t_rttlow = ticks - tp->t_rtttime; + tcp_xmit_timer(tp, + ticks - tp->t_rtttime); + } + if (winup_only == 0) { + acked = BYTES_THIS_ACK(tp, th); + + /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ + hhook_run_tcp_est_in(tp, th, to); + + TCPSTAT_ADD(tcps_rcvackbyte, acked); + sbdrop(&so->so_snd, acked); + if (SEQ_GT(tp->snd_una, tp->snd_recover) && + SEQ_LEQ(th->th_ack, tp->snd_recover)) + tp->snd_recover = th->th_ack - 1; + + /* + * Let the congestion control algorithm update + * congestion control related information. This + * typically means increasing the congestion + * window. + */ + cc_ack_received(tp, th, CC_ACK); + + tp->snd_una = th->th_ack; + /* + * Pull snd_wl2 up to prevent seq wrap relative + * to th_ack. + */ + tp->snd_wl2 = th->th_ack; + tp->t_dupacks = 0; + m_freem(m); + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (tp->snd_una == tp->snd_max) + tcp_timer_activate(tp, TT_REXMT, 0); + else if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_timer_activate(tp, TT_REXMT, + tp->t_rxtcur); + } else { + /* + * Window update only, just free the mbufs and + * send out whatever we can. + */ + m_freem(m); + } + sowwakeup(so); + if (sbavail(&so->so_snd)) + (void) tcp_output(tp); + KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", + __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (tp->t_flags & TF_DELACK) { + tp->t_flags &= ~TF_DELACK; + tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + } + INP_WUNLOCK(tp->t_inpcb); +} + +/* + * Here nothing is really faster, its just that we + * have broken out the fast-data path also just like + * the fast-ack. + */ +static void +tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, + int ti_locked, u_long tiwin) +{ + int newsize = 0; /* automatic sockbuf scaling */ +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; +#endif + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + + /* + * This is a pure, in-sequence data packet with + * nothing on the reassembly queue and we have enough + * buffer space to take it. 
+ */ + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + } + ti_locked = TI_UNLOCKED; + + /* Clean receiver SACK report if present */ + if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) + tcp_clean_sackreport(tp); + TCPSTAT_INC(tcps_preddat); + tp->rcv_nxt += tlen; + /* + * Pull snd_wl1 up to prevent seq wrap relative to + * th_seq. + */ + tp->snd_wl1 = th->th_seq; + /* + * Pull rcv_up up to prevent seq wrap relative to + * rcv_nxt. + */ + tp->rcv_up = tp->rcv_nxt; + TCPSTAT_ADD(tcps_rcvbyte, tlen); +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, &tcp_savetcp, 0); +#endif + /* + * Automatic sizing of receive socket buffer. Often the send + * buffer size is not optimally adjusted to the actual network + * conditions at hand (delay bandwidth product). Setting the + * buffer size too small limits throughput on links with high + * bandwidth and high delay (eg. trans-continental/oceanic links). + * + * On the receive side the socket buffer memory is only rarely + * used to any significant extent. This allows us to be much + * more aggressive in scaling the receive socket buffer. For + * the case that the buffer space is actually used to a large + * extent and we run out of kernel memory we can simply drop + * the new segments; TCP on the sender will just retransmit it + * later. Setting the buffer size too big may only consume too + * much kernel memory if the application doesn't read() from + * the socket or packet loss or reordering makes use of the + * reassembly queue. + * + * The criteria to step up the receive buffer one notch are: + * 1. Application has not set receive buffer size with + * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. + * 2. the number of bytes received during the time it takes + * one timestamp to be reflected back to us (the RTT); + * 3. received bytes per RTT is within seven eighth of the + * current socket buffer size; + * 4. receive buffer size has not hit maximal automatic size; + * + * This algorithm does one step per RTT at most and only if + * we receive a bulk stream w/o packet losses or reorderings. + * Shrinking the buffer during idle times is not necessary as + * it doesn't consume any memory when idle. + * + * TODO: Only step up if the application is actually serving + * the buffer to better manage the socket buffer resources. + */ + if (V_tcp_do_autorcvbuf && + (to->to_flags & TOF_TS) && + to->to_tsecr && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) { + if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) && + to->to_tsecr - tp->rfbuf_ts < hz) { + if (tp->rfbuf_cnt > + (so->so_rcv.sb_hiwat / 8 * 7) && + so->so_rcv.sb_hiwat < + V_tcp_autorcvbuf_max) { + newsize = + min(so->so_rcv.sb_hiwat + + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); + } + /* Start over with next RTT. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + } else + tp->rfbuf_cnt += tlen; /* add up */ + } + + /* Add data to socket buffer. */ + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + m_freem(m); + } else { + /* + * Set new socket buffer size. + * Give up when limit is reached. + */ + if (newsize) + if (!sbreserve_locked(&so->so_rcv, + newsize, so, NULL)) + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappendstream_locked(&so->so_rcv, m, 0); + } + /* NB: sorwakeup_locked() does an implicit unlock. 
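
[Editor's aside: a hypothetical condensation of the receive-buffer auto-scaling decision spelled out above. Criterion 2, one step per RTT, is enforced by the caller through the rfbuf_ts/rfbuf_cnt bookkeeping and is omitted here.]

static int
rcvbuf_autoscale_step(struct socket *so, struct tcpcb *tp)
{
	int newsize = 0;

	if ((so->so_rcv.sb_flags & SB_AUTOSIZE) &&	/* 1: app left SO_RCVBUF alone */
	    tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&	/* 3: >= 7/8 filled per RTT */
	    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max)	/* 4: below the cap */
		newsize = min(so->so_rcv.sb_hiwat +
		    V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
	return (newsize);	/* 0 means: leave the buffer as it is */
}
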
*/ + sorwakeup_locked(so); + if (DELAY_ACK(tp, tlen)) { + tp->t_flags |= TF_DELACK; + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", + __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (tp->t_flags & TF_DELACK) { + tp->t_flags &= ~TF_DELACK; + tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + } + INP_WUNLOCK(tp->t_inpcb); +} + +/* + * The slow-path is the clone of the long long part + * of tcp_do_segment past all the fast-path stuff. We + * use it here by two different callers, the fast/slow and + * the fastack only. + */ +static void +tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, + int ti_locked, u_long tiwin, int thflags) +{ + int acked, ourfinisacked, needoutput = 0; + int rstreason, todrop, win; + char *s; + struct in_conninfo *inc; + struct mbuf *mfree = NULL; +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; +#endif + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + inc = &tp->t_inpcb->inp_inc; + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + + switch (tp->t_state) { + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if seg contains an ECE and ECN support is enabled, the stream + * is ECN capable. + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { + TCP_PROBE5(connect__refused, NULL, tp, + mtod(m, const char *), tp, th); + tp = tcp_drop(tp, ECONNREFUSED); + } + if (thflags & TH_RST) + goto drop; + if (!(thflags & TH_SYN)) + goto drop; + + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + TCPSTAT_INC(tcps_connects); + soisconnected(so); +#ifdef MAC + mac_socketpeer_set_from_mbuf(m, so); +#endif + /* Do window scaling on this connection? 
*/ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + } + tp->rcv_adv += imin(tp->rcv_wnd, + TCP_MAXWIN << tp->rcv_scale); + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (DELAY_ACK(tp, tlen) && tlen != 0) + tcp_timer_activate(tp, TT_DELACK, + tcp_delacktime); + else + tp->t_flags |= TF_ACKNOW; + + if ((thflags & TH_ECE) && V_tcp_do_ecn) { + tp->t_flags |= TF_ECN_PERMIT; + TCPSTAT_INC(tcps_ecn_shs); + } + + /* + * Received in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tcp_state_change(tp, TCPS_FIN_WAIT_1); + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(connect__established, NULL, tp, + mtod(m, const char *), tp, th); + cc_conn_init(tp); + tcp_timer_activate(tp, TT_KEEP, + TP_KEEPIDLE(tp)); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => + * simultaneous open. + * If it succeeds, connection is * half-synchronized. + * Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + */ + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + tcp_timer_activate(tp, TT_REXMT, 0); + tcp_state_change(tp, TCPS_SYN_RECEIVED); + } + + KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " + "ti_locked %d", __func__, ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Advance th->th_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + TCPSTAT_INC(tcps_rcvpackafterwin); + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (thflags & TH_ACK) + goto process_ACK; + + goto step6; + + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * do normal processing. + * + * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + */ + if (thflags & TH_RST) { + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. + * - If RST is in window, we send challenge ACK. 
+ * + * Note: to take into account delayed ACKs, we should + * test against last_ack_sent instead of rcv_nxt. + * Note 2: we handle special case of closed window, not + * covered by the RFC. + */ + if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, + ("%s: TH_RST ti_locked %d, th %p tp %p", + __func__, ti_locked, th, tp)); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + tp->last_ack_sent == th->th_seq) { + TCPSTAT_INC(tcps_drops); + /* Drop the connection. */ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + } + } + goto drop; + } + + /* + * RFC5961 Section 4.2 + * Send challenge ACK for any SYN in synchronized state. + */ + if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { + KASSERT(ti_locked == TI_RLOCKED, + ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + } else { + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, tlen); + TCPSTAT_INC(tcps_pawsdrop); + if (tlen) + goto dropafterack; + goto drop; + } + } + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. This is a partial fix + * for the "LAND" DoS attack. 
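
[Editor's aside: taken together, the RST rules above (RFC 5961 section 3.2) form a three-way decision; a hypothetical condensation, with the insecure_rst parameter standing in for V_tcp_insecure_rst.]

enum rst_action { RST_DROP, RST_CHALLENGE, RST_CLOSE };

static enum rst_action
rfc5961_rst_action(const struct tcpcb *tp, tcp_seq seq, int insecure_rst)
{
	int in_window;

	in_window = (SEQ_GEQ(seq, tp->last_ack_sent) &&
	    SEQ_LT(seq, tp->last_ack_sent + tp->rcv_wnd)) ||
	    (tp->rcv_wnd == 0 && tp->last_ack_sent == seq);
	if (!in_window)
		return (RST_DROP);		/* out of window: ignore */
	if (insecure_rst || tp->last_ack_sent == seq)
		return (RST_CLOSE);		/* exact match: tear down */
	return (RST_CHALLENGE);			/* in window: challenge ACK */
}
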
+ */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, todrop); + } else { + TCPSTAT_INC(tcps_rcvpartduppack); + TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); + } + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && tlen) { + KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " + "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " + "after socket was closed, " + "sending RST and removing tcpcb\n", + s, __func__, tcpstates[tp->t_state], tlen); + free(s, M_TCPLOG); + } + tp = tcp_close(tp); + TCPSTAT_INC(tcps_rcvafterclose); + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); + if (todrop > 0) { + TCPSTAT_INC(tcps_rcvpackafterwin); + if (todrop >= tlen) { + TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_rcvwinprobe); + } else + goto dropafterack; + } else + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE: + * 1) That the test incorporates suggestions from the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + * 2) That updating only on newer timestamps interferes with + * our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. + * 3) That we modify the segment boundary check to be + * Last.ACK.Sent <= SEG.SEQ + SEG.Len + * instead of RFC1323's + * Last.ACK.Sent < SEG.SEQ + SEG.Len, + * This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated + * Vol. 2 p.869. In such cases, we can still calculate the + * RTT correctly when RCV.NXT == Last.ACK.Sent. 
+ */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN|TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else if (tp->t_flags & TF_ACKNOW) + goto dropafterack; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + TCPSTAT_INC(tcps_connects); + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = tiwin; + } + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tcp_state_change(tp, TCPS_FIN_WAIT_1); + tp->t_flags &= ~TF_NEEDFIN; + } else { + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(accept__established, NULL, tp, + mtod(m, const char *), tp, th); + cc_conn_init(tp); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + /* FALLTHROUGH */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + if (SEQ_GT(th->th_ack, tp->snd_max)) { + TCPSTAT_INC(tcps_rcvacktoomuch); + goto dropafterack; + } + if ((tp->t_flags & TF_SACK_PERMIT) && + ((to->to_flags & TOF_SACK) || + !TAILQ_EMPTY(&tp->snd_holes))) + tcp_sack_doack(tp, to, th->th_ack); + else + /* + * Reset the value so that previous (valid) value + * from the last ack with SACK doesn't get used. + */ + tp->sackhint.sacked_bytes = 0; + + /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ + hhook_run_tcp_est_in(tp, th, to); + + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { + /* + * If this is the first time we've seen a + * FIN from the remote, this is not a + * duplicate and it needs to be processed + * normally. This happens during a + * simultaneous close. + */ + if ((thflags & TH_FIN) && + (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { + tp->t_dupacks = 0; + break; + } + TCPSTAT_INC(tcps_rcvdupack); + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change and FIN isn't set), + * the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. 
+ * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + * + * When using TCP ECN, notify the peer that + * we reduced the cwnd. + */ + if (!tcp_timer_active(tp, TT_REXMT) || + th->th_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks > tcprexmtthresh || + IN_FASTRECOVERY(tp->t_flags)) { + cc_ack_received(tp, th, CC_DUPACK); + if ((tp->t_flags & TF_SACK_PERMIT) && + IN_FASTRECOVERY(tp->t_flags)) { + int awnd; + + /* + * Compute the amount of data in flight first. + * We can inject new data into the pipe iff + * we have less than 1/2 the original window's + * worth of data in flight. + */ + if (V_tcp_do_rfc6675_pipe) + awnd = tcp_compute_pipe(tp); + else + awnd = (tp->snd_nxt - tp->snd_fack) + + tp->sackhint.sack_bytes_rexmit; + + if (awnd < tp->snd_ssthresh) { + tp->snd_cwnd += tp->t_maxseg; + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + } else + tp->snd_cwnd += tp->t_maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); + goto drop; + } else if (tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + + /* + * If we're doing sack, check to + * see if we're already in sack + * recovery. If we're not doing sack, + * check to see if we're in newreno + * recovery. + */ + if (tp->t_flags & TF_SACK_PERMIT) { + if (IN_FASTRECOVERY(tp->t_flags)) { + tp->t_dupacks = 0; + break; + } + } else { + if (SEQ_LEQ(th->th_ack, + tp->snd_recover)) { + tp->t_dupacks = 0; + break; + } + } + /* Congestion signal before ack. */ + cc_cong_signal(tp, th, CC_NDUPACK); + cc_ack_received(tp, th, CC_DUPACK); + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rtttime = 0; + if (tp->t_flags & TF_SACK_PERMIT) { + TCPSTAT_INC( + tcps_sack_recovery_episode); + tp->sack_newdata = tp->snd_nxt; + tp->snd_cwnd = tp->t_maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); + goto drop; + } + tp->snd_nxt = th->th_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); + KASSERT(tp->snd_limited <= 2, + ("%s: tp->snd_limited too big", + __func__)); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * + (tp->t_dupacks - tp->snd_limited); + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (V_tcp_do_rfc3042) { + /* + * Process first and second duplicate + * ACKs. Each indicates a segment + * leaving the network, creating room + * for more. Make sure we can send a + * packet on reception of each duplicate + * ACK by increasing snd_cwnd by one + * segment. Restore the original + * snd_cwnd after packet transmission. + */ + cc_ack_received(tp, th, CC_DUPACK); + u_long oldcwnd = tp->snd_cwnd; + tcp_seq oldsndmax = tp->snd_max; + u_int sent; + int avail; + + KASSERT(tp->t_dupacks == 1 || + tp->t_dupacks == 2, + ("%s: dupacks not 1 or 2", + __func__)); + if (tp->t_dupacks == 1) + tp->snd_limited = 0; + tp->snd_cwnd = + (tp->snd_nxt - tp->snd_una) + + (tp->t_dupacks - tp->snd_limited) * + tp->t_maxseg; + /* + * Only call tcp_output when there + * is new data available to be sent. + * Otherwise we would send pure ACKs. 
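
[Editor's aside: the cwnd chosen above for the first two duplicate ACKs is flight size plus one segment per duplicate ACK not yet spent on limited transmit (RFC 3042); restated as a hypothetical helper. The caller restores the old cwnd after transmitting.]

static u_long
rfc3042_limited_transmit_cwnd(const struct tcpcb *tp)
{
	return ((tp->snd_nxt - tp->snd_una) +
	    (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg);
}
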
+ */ + SOCKBUF_LOCK(&so->so_snd); + avail = sbavail(&so->so_snd) - + (tp->snd_nxt - tp->snd_una); + SOCKBUF_UNLOCK(&so->so_snd); + if (avail > 0) + (void) tp->t_fb->tfb_tcp_output(tp); + sent = tp->snd_max - oldsndmax; + if (sent > tp->t_maxseg) { + KASSERT((tp->t_dupacks == 2 && + tp->snd_limited == 0) || + (sent == tp->t_maxseg + 1 && + tp->t_flags & TF_SENTFIN), + ("%s: sent too much", + __func__)); + tp->snd_limited = 2; + } else if (sent > 0) + ++tp->snd_limited; + tp->snd_cwnd = oldcwnd; + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + + KASSERT(SEQ_GT(th->th_ack, tp->snd_una), + ("%s: th_ack <= snd_una", __func__)); + + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (IN_FASTRECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else + cc_post_recovery(tp, th); + } + tp->t_dupacks = 0; + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + /* Send window already scaled. */ + } + } + +process_ACK: + INP_WLOCK_ASSERT(tp->t_inpcb); + + acked = BYTES_THIS_ACK(tp, th); + TCPSTAT_INC(tcps_rcvackpack); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && + (int)(ticks - tp->t_badrxtwin) < 0) + cc_cong_signal(tp, th, CC_RTO_ERR); + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + * + * Some boxes send broken timestamp replies + * during the SYN+ACK phase, ignore + * timestamps of 0 or we could calculate a + * huge RTT and blow up the retransmit timer. + */ + if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { + u_int t; + + t = tcp_ts_getticks() - to->to_tsecr; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); + } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { + if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) + tp->t_rttlow = ticks - tp->t_rtttime; + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + } + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. 
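
[Editor's aside: the timestamp branch of the RTT sampling above, isolated for clarity; a hypothetical helper over the same fields. An echoed value of 0 is ignored to sidestep peers that send broken timestamp replies during the SYN+ACK phase.]

static void
rtt_sample_from_timestamp(struct tcpcb *tp, const struct tcpopt *to)
{
	u_int t;

	if ((to->to_flags & TOF_TS) == 0 || to->to_tsecr == 0)
		return;
	t = tcp_ts_getticks() - to->to_tsecr;
	if (!tp->t_rttlow || tp->t_rttlow > t)
		tp->t_rttlow = t;
	tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
}
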
+ */ + if (th->th_ack == tp->snd_max) { + tcp_timer_activate(tp, TT_REXMT, 0); + needoutput = 1; + } else if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * Let the congestion control algorithm update congestion + * control related information. This typically means increasing + * the congestion window. + */ + cc_ack_received(tp, th, CC_ACK); + + SOCKBUF_LOCK(&so->so_snd); + if (acked > sbavail(&so->so_snd)) { + tp->snd_wnd -= sbavail(&so->so_snd); + mfree = sbcut_locked(&so->so_snd, + (int)sbavail(&so->so_snd)); + ourfinisacked = 1; + } else { + mfree = sbcut_locked(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + /* NB: sowwakeup_locked() does an implicit unlock. */ + sowwakeup_locked(so); + m_freem(mfree); + /* Detect una wraparound. */ + if (!IN_RECOVERY(tp->t_flags) && + SEQ_GT(tp->snd_una, tp->snd_recover) && + SEQ_LEQ(th->th_ack, tp->snd_recover)) + tp->snd_recover = th->th_ack - 1; + /* XXXLAS: Can this be moved up into cc_post_recovery? */ + if (IN_RECOVERY(tp->t_flags) && + SEQ_GEQ(th->th_ack, tp->snd_recover)) { + EXIT_RECOVERY(tp->t_flags); + } + tp->snd_una = th->th_ack; + if (tp->t_flags & TF_SACK_PERMIT) { + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + } + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: + * we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + tcp_twstart(tp); + INP_INFO_RUNLOCK(&V_tcbinfo); + m_freem(m); + return; + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + tp = tcp_close(tp); + goto drop; + } + break; + } + } + +step6: + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. 
+ */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + TCPSTAT_INC(tcps_rcvwinupd); + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + SOCKBUF_LOCK(&so->so_rcv); + if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = sbavail(&so->so_rcv) + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_rcv.sb_state |= SBS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (u_long)tlen && + !(so->so_options & SO_OOBINLINE)) { + /* hdr drop is delayed */ + tcp_pulloutofband(so, th, m, drop_hdrlen); + } + } else { + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; + } +dodata: /* XXX */ + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((tlen || (thflags & TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + tcp_seq save_start = th->th_seq; + m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * Insert segment which includes th into TCP reassembly queue + * with control block tp. Set thflags to whether reassembly now + * includes a segment with FIN. This handles the common case + * inline (segment is the next to be received on an established + * connection, and the queue is empty), avoiding linkage into + * and removal from the queue and repetition of various + * conversions. 
+ * Set DELACK for segments received in order, but ack + * immediately when segments are out of order (so + * fast retransmit can work). + */ + if (th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + if (DELAY_ACK(tp, tlen)) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt += tlen; + thflags = th->th_flags & TH_FIN; + TCPSTAT_INC(tcps_rcvpack); + TCPSTAT_ADD(tcps_rcvbyte, tlen); + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + m_freem(m); + else + sbappendstream_locked(&so->so_rcv, m, 0); + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); + } else { + /* + * XXX: Due to the header drop above "th" is + * theoretically invalid by now. Fortunately + * m_adj() doesn't actually frees any mbufs + * when trimming from the head. + */ + thflags = tcp_reass(tp, th, &tlen, m); + tp->t_flags |= TF_ACKNOW; + } + if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) + tcp_update_sack_list(tp, save_start, save_start + tlen); +#if 0 + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + * XXX: Unused. + */ + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + else + len = so->so_rcv.sb_hiwat; +#endif + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tcp_state_change(tp, TCPS_CLOSE_WAIT); + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tcp_state_change(tp, TCPS_CLOSING); + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " + "TCP_FIN_WAIT_2 ti_locked: %d", __func__, + ti_locked)); + + tcp_twstart(tp); + INP_INFO_RUNLOCK(&V_tcbinfo); + return; + } + } + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + } + ti_locked = TI_UNLOCKED; + +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); + + /* + * Return any desired output. 
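
[Editor's aside: the common-case test above as a standalone predicate (hypothetical): data may bypass tcp_reass() only when it lands exactly at rcv_nxt, nothing is queued for reassembly, and the connection is established.]

static inline int
segment_appends_in_order(const struct tcpcb *tp, const struct tcphdr *th)
{
	return (th->th_seq == tp->rcv_nxt &&
	    LIST_EMPTY(&tp->t_segq) &&
	    TCPS_HAVEESTABLISHED(tp->t_state));
}
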
+	if (needoutput || (tp->t_flags & TF_ACKNOW))
+		(void) tp->t_fb->tfb_tcp_output(tp);
+
+	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+	    __func__, ti_locked));
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	if (tp->t_flags & TF_DELACK) {
+		tp->t_flags &= ~TF_DELACK;
+		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+	}
+	INP_WUNLOCK(tp->t_inpcb);
+	return;
+
+dropafterack:
+	/*
+	 * Generate an ACK dropping incoming segment if it occupies
+	 * sequence space, where the ACK reflects our state.
+	 *
+	 * We can now skip the test for the RST flag since all
+	 * paths to this code happen after packets containing
+	 * RST have been dropped.
+	 *
+	 * In the SYN-RECEIVED state, don't send an ACK unless the
+	 * segment we received passes the SYN-RECEIVED ACK test.
+	 * If it fails send a RST.  This breaks the loop in the
+	 * "LAND" DoS attack, and also prevents an ACK storm
+	 * between two listening ports that have been sent forged
+	 * SYN segments, each with the source address of the other.
+	 */
+	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+	    (SEQ_GT(tp->snd_una, th->th_ack) ||
+	    SEQ_GT(th->th_ack, tp->snd_max))) {
+		rstreason = BANDLIM_RST_OPENPORT;
+		goto dropwithreset;
+	}
+#ifdef TCPDEBUG
+	if (so->so_options & SO_DEBUG)
+		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+		    &tcp_savetcp, 0);
+#endif
+	TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+	if (ti_locked == TI_RLOCKED) {
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+	ti_locked = TI_UNLOCKED;
+
+	tp->t_flags |= TF_ACKNOW;
+	(void) tp->t_fb->tfb_tcp_output(tp);
+	INP_WUNLOCK(tp->t_inpcb);
+	m_freem(m);
+	return;
+
+dropwithreset:
+	if (ti_locked == TI_RLOCKED) {
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+	ti_locked = TI_UNLOCKED;
+
+	if (tp != NULL) {
+		tcp_dropwithreset(m, th, tp, tlen, rstreason);
+		INP_WUNLOCK(tp->t_inpcb);
+	} else
+		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+	return;
+
+drop:
+	if (ti_locked == TI_RLOCKED) {
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+		ti_locked = TI_UNLOCKED;
+	}
+#ifdef INVARIANTS
+	else
+		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
+
+	/*
+	 * Drop space held by incoming segment and return.
+	 */
+#ifdef TCPDEBUG
+	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+		    &tcp_savetcp, 0);
+#endif
+	TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+	if (tp != NULL)
+		INP_WUNLOCK(tp->t_inpcb);
+	m_freem(m);
+}
+
+
+/*
+ * tcp_do_segment_fastslow() combines the original tcp_do_segment()
+ * with a split fast path: one function handles pure fast acks (and
+ * also allows window updates that advance in sequence to stay on the
+ * fast path), and a subfunction handles in-sequence data.
+ */
+void
+tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
+    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+    int ti_locked)
+{
+	int thflags;
+	u_long tiwin;
+	char *s;
+	int can_enter;
+	struct in_conninfo *inc;
+	struct tcpopt to;
+
+	thflags = th->th_flags;
+	tp->sackhint.last_sack_ack = 0;
+	inc = &tp->t_inpcb->inp_inc;
+	/*
+	 * If this is either a state-changing packet or current state isn't
+	 * established, we require a write lock on tcbinfo.  Otherwise, we
+	 * allow the tcbinfo to be in either locked or unlocked, as the
+	 * caller may have unnecessarily acquired a write lock due to a race.
+	 */
+	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+	    tp->t_state != TCPS_ESTABLISHED) {
+		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+		    "SYN/FIN/RST/!EST", __func__, ti_locked));
+		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+	} else {
+#ifdef INVARIANTS
+		if (ti_locked == TI_RLOCKED) {
+			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		} else {
+			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+			    "ti_locked: %d", __func__, ti_locked));
+			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+		}
+#endif
+	}
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+	    __func__));
+	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+	    __func__));
+
+	/*
+	 * Segment received on connection.
+	 * Reset idle time and keep-alive timer.
+	 * XXX: This should be done after segment
+	 * validation to ignore broken/spoofed segs.
+	 */
+	tp->t_rcvtime = ticks;
+	if (TCPS_HAVEESTABLISHED(tp->t_state))
+		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+
+	/*
+	 * Unscale the window into a 32-bit value.
+	 * For the SYN_SENT state the scale is zero.
+	 */
+	tiwin = th->th_win << tp->snd_scale;
+
+	/*
+	 * TCP ECN processing.
+	 */
+	if (tp->t_flags & TF_ECN_PERMIT) {
+		if (thflags & TH_CWR)
+			tp->t_flags &= ~TF_ECN_SND_ECE;
+		switch (iptos & IPTOS_ECN_MASK) {
+		case IPTOS_ECN_CE:
+			tp->t_flags |= TF_ECN_SND_ECE;
+			TCPSTAT_INC(tcps_ecn_ce);
+			break;
+		case IPTOS_ECN_ECT0:
+			TCPSTAT_INC(tcps_ecn_ect0);
+			break;
+		case IPTOS_ECN_ECT1:
+			TCPSTAT_INC(tcps_ecn_ect1);
+			break;
+		}
+		/* Congestion experienced. */
+		if (thflags & TH_ECE) {
+			cc_cong_signal(tp, th, CC_ECN);
+		}
+	}
+
+	/*
+	 * Parse options on any incoming segment.
+	 */
+	tcp_dooptions(&to, (u_char *)(th + 1),
+	    (th->th_off << 2) - sizeof(struct tcphdr),
+	    (thflags & TH_SYN) ? TO_SYN : 0);
+
+	/*
+	 * If echoed timestamp is later than the current time,
+	 * fall back to non RFC1323 RTT calculation.  Normalize
+	 * timestamp if syncookies were used when this connection
+	 * was established.
+	 */
+	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+		to.to_tsecr -= tp->ts_offset;
+		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+			to.to_tsecr = 0;
+	}
+	/*
+	 * If timestamps were negotiated during SYN/ACK they should
+	 * appear on every segment during this session and vice versa.
+	 */
+	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+			    "no action\n", s, __func__);
+			free(s, M_TCPLOG);
+		}
+	}
+	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+			    "no action\n", s, __func__);
+			free(s, M_TCPLOG);
+		}
+	}
+
+	/*
+	 * Process options only when we get SYN/ACK back. The SYN case
+	 * for incoming connections is handled in tcp_syncache.
+	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+	 * or <SYN,ACK>) segment itself is never scaled.
+	 * XXX this is traditional behavior, may need to be cleaned up.
+	 */
+	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+		if ((to.to_flags & TOF_SCALE) &&
+		    (tp->t_flags & TF_REQ_SCALE)) {
+			tp->t_flags |= TF_RCVD_SCALE;
+			tp->snd_scale = to.to_wscale;
+		}
+		/*
+		 * Initial send window.  It will be updated with
+		 * the next incoming segment to the scaled value.
+		 */
+		tp->snd_wnd = th->th_win;
+		if (to.to_flags & TOF_TS) {
+			tp->t_flags |= TF_RCVD_TSTMP;
+			tp->ts_recent = to.to_tsval;
+			tp->ts_recent_age = tcp_ts_getticks();
+		}
+		if (to.to_flags & TOF_MSS)
+			tcp_mss(tp, to.to_mss);
+		if ((tp->t_flags & TF_SACK_PERMIT) &&
+		    (to.to_flags & TOF_SACKPERM) == 0)
+			tp->t_flags &= ~TF_SACK_PERMIT;
+	}
+	can_enter = 0;
+	if (__predict_true((tlen == 0))) {
+		/*
+		 * A pure ack may enter the fast path if either the
+		 * ack moved forward and we have a (non-zero) window,
+		 * or the ack did not move forward but the window
+		 * increased.
+		 */
+		if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
+		    ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
+			can_enter = 1;
+		}
+	} else {
+		/*
+		 * Data incoming, use the old entry criteria
+		 * for fast-path with data.
+		 */
+		if ((tiwin && tiwin == tp->snd_wnd)) {
+			can_enter = 1;
+		}
+	}
+	/*
+	 * Header prediction: check for the two common cases
+	 * of a uni-directional data xfer.  If the packet has
+	 * no control flags, is in-sequence, the window didn't
+	 * change and we're not retransmitting, it's a
+	 * candidate.  If the length is zero and the ack moved
+	 * forward, we're the sender side of the xfer.  Just
+	 * free the data acked & wake any higher level process
+	 * that was blocked waiting for space.  If the length
+	 * is non-zero and the ack didn't move, we're the
+	 * receiver side.  If we're getting packets in-order
+	 * (the reassembly queue is empty), add the data to
+	 * the socket buffer and note that we need a delayed ack.
+	 * Make sure that the hidden state-flags are also off.
+	 * Since we check for TCPS_ESTABLISHED first, it can only
+	 * be TH_NEEDSYN.
+	 */
+	if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
+	    th->th_seq == tp->rcv_nxt &&
+	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+	    tp->snd_nxt == tp->snd_max &&
+	    can_enter &&
+	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
+	    LIST_EMPTY(&tp->t_segq) &&
+	    ((to.to_flags & TOF_TS) == 0 ||
+	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
+		if (__predict_true((tlen == 0) &&
+		    (SEQ_LEQ(th->th_ack, tp->snd_max) &&
+		     !IN_RECOVERY(tp->t_flags) &&
+		     (to.to_flags & TOF_SACK) == 0 &&
+		     TAILQ_EMPTY(&tp->snd_holes)))) {
+			/* We are done */
+			tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
+			    ti_locked, tiwin);
+			return;
+		} else if ((tlen) &&
+		    (th->th_ack == tp->snd_una &&
+		     tlen <= sbspace(&so->so_rcv))) {
+			tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
+			    ti_locked, tiwin);
+			/* We are done */
+			return;
+		}
+	}
+	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
+	    ti_locked, tiwin, thflags);
+}
+
+
+/*
+ * This subfunction tries to highly optimize the fast-ack path.  As
+ * above, window updates that arrive in sequence are allowed to stay
+ * on the fast path.  The __predict hints are added to help the
+ * compiler lay out the common case.  Note that if we return 0 we
+ * could *not* process the segment and the caller must push the
+ * packet into the slow path.
+ */
+static int
+tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+    struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+    int ti_locked, u_long tiwin)
+{
+	int acked;
+	int winup_only = 0;
+#ifdef TCPDEBUG
+	/*
+	 * The size of tcp_saveipgen must be the size of the max ip header,
+	 * now IPv6.
+	 */
+	u_char tcp_saveipgen[IP6_HDR_LEN];
+	struct tcphdr tcp_savetcp;
+	short ostate = 0;
+#endif
+
+	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+		/* Old ack, behind (or duplicate to) the last one rcv'd */
+		return (0);
+	}
+	if (__predict_false(th->th_ack == tp->snd_una) &&
+	    __predict_false(tiwin <= tp->snd_wnd)) {
+		/* Duplicate ack with an unchanged or shrinking window */
+		return (0);
+	}
+	if (__predict_false(tiwin == 0)) {
+		/* zero window */
+		return (0);
+	}
+	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
+		/* Above what we have sent? */
+		return (0);
+	}
+	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
+		/* We are retransmitting */
+		return (0);
+	}
+	if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
+		/* We need a SYN or a FIN, unlikely.. */
+		return (0);
+	}
+	if ((to->to_flags & TOF_TS) &&
+	    __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
+		/* Timestamp is behind .. old ack with seq wrap? */
+		return (0);
+	}
+	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
+		/* Still recovering */
+		return (0);
+	}
+	if (__predict_false(to->to_flags & TOF_SACK)) {
+		/* SACK blocks included in the ack.. */
+		return (0);
+	}
+	if (!TAILQ_EMPTY(&tp->snd_holes)) {
+		/* We have sack holes on our scoreboard */
+		return (0);
+	}
+	/* Ok, if we reach here we can process a fast-ack. */
+
+	/* Did the window get updated? */
+	if (tiwin != tp->snd_wnd) {
+		/* keep track of pure window updates */
+		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+			winup_only = 1;
+			TCPSTAT_INC(tcps_rcvwinupd);
+		}
+		tp->snd_wnd = tiwin;
+		tp->snd_wl1 = th->th_seq;
+		if (tp->snd_wnd > tp->max_sndwnd)
+			tp->max_sndwnd = tp->snd_wnd;
+	}
+	/*
+	 * Pull snd_wl2 up to prevent seq wrap relative
+	 * to th_ack.
+	 */
+	tp->snd_wl2 = th->th_ack;
+	/*
+	 * If last ACK falls within this segment's sequence numbers,
+	 * record the timestamp.
+	 * NOTE that the test is modified according to the latest
+	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+	 */
+	if ((to->to_flags & TOF_TS) != 0 &&
+	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+		tp->ts_recent_age = tcp_ts_getticks();
+		tp->ts_recent = to->to_tsval;
+	}
+	/*
+	 * This is a pure ack for outstanding data.
+	 */
+	if (ti_locked == TI_RLOCKED) {
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+	ti_locked = TI_UNLOCKED;
+
+	TCPSTAT_INC(tcps_predack);
+
+	/*
+	 * "bad retransmit" recovery.
+	 */
+	if (tp->t_rxtshift == 1 &&
+	    tp->t_flags & TF_PREVVALID &&
+	    (int)(ticks - tp->t_badrxtwin) < 0) {
+		cc_cong_signal(tp, th, CC_RTO_ERR);
+	}
+
+	/*
+	 * Recalculate the transmit timer / rtt.
+	 *
+	 * Some boxes send broken timestamp replies
+	 * during the SYN+ACK phase, ignore
+	 * timestamps of 0 or we could calculate a
+	 * huge RTT and blow up the retransmit timer.
+	 */
+	if ((to->to_flags & TOF_TS) != 0 &&
+	    to->to_tsecr) {
+		u_int t;
+
+		t = tcp_ts_getticks() - to->to_tsecr;
+		if (!tp->t_rttlow || tp->t_rttlow > t)
+			tp->t_rttlow = t;
+		tcp_xmit_timer(tp,
+		    TCP_TS_TO_TICKS(t) + 1);
+	} else if (tp->t_rtttime &&
+	    SEQ_GT(th->th_ack, tp->t_rtseq)) {
+		if (!tp->t_rttlow ||
+		    tp->t_rttlow > ticks - tp->t_rtttime)
+			tp->t_rttlow = ticks - tp->t_rtttime;
+		tcp_xmit_timer(tp,
+		    ticks - tp->t_rtttime);
+	}
+	if (winup_only == 0) {
+		acked = BYTES_THIS_ACK(tp, th);
+
+		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+		hhook_run_tcp_est_in(tp, th, to);
+
+		TCPSTAT_ADD(tcps_rcvackbyte, acked);
+		sbdrop(&so->so_snd, acked);
+		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+		    SEQ_LEQ(th->th_ack, tp->snd_recover))
+			tp->snd_recover = th->th_ack - 1;
+
+		/*
+		 * Let the congestion control algorithm update
+		 * congestion control related information. This
+		 * typically means increasing the congestion
+		 * window.
+		 */
+		cc_ack_received(tp, th, CC_ACK);
+
+		tp->snd_una = th->th_ack;
+		tp->t_dupacks = 0;
+		m_freem(m);
+
+		/*
+		 * If all outstanding data are acked, stop
+		 * retransmit timer, otherwise restart timer
+		 * using current (possibly backed-off) value.
+		 * If process is waiting for space,
+		 * wakeup/selwakeup/signal.  If data
+		 * are ready to send, let tcp_output
+		 * decide between more output or persist.
+		 */
+#ifdef TCPDEBUG
+		if (so->so_options & SO_DEBUG)
+			tcp_trace(TA_INPUT, ostate, tp,
+			    (void *)tcp_saveipgen,
+			    &tcp_savetcp, 0);
+#endif
+		if (tp->snd_una == tp->snd_max)
+			tcp_timer_activate(tp, TT_REXMT, 0);
+		else if (!tcp_timer_active(tp, TT_PERSIST))
+			tcp_timer_activate(tp, TT_REXMT,
+			    tp->t_rxtcur);
+		/* Wake up the socket if we have room to write more */
+		sowwakeup(so);
+	} else {
+		/*
+		 * Window update only, just free the mbufs and
+		 * send out whatever we can.
+		 */
+		m_freem(m);
+	}
+	if (sbavail(&so->so_snd))
+		(void) tcp_output(tp);
+	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+	    __func__, ti_locked));
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	if (tp->t_flags & TF_DELACK) {
+		tp->t_flags &= ~TF_DELACK;
+		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+	}
+	INP_WUNLOCK(tp->t_inpcb);
+	return (1);
+}
+
+/*
+ * This tcp_do_segment variant concentrates on the fastest possible
+ * ack processing path.  It has no fast path for data (it possibly
+ * could have one, which would then eliminate the need for the
+ * fast-slow variant above).  For a content distributor with large
+ * outgoing elephants and very little incoming traffic, a data fast
+ * path does not really help, since little data comes in.  The most
+ * important thing is processing acks quickly and getting the rest of
+ * the data out to the peer as quickly as possible.  This routine
+ * measures about 3% faster overall than the old tcp_do_segment and
+ * keeps us on the fast path for many more packets, by allowing
+ * window updates to also stay on the fast path.
+ */
+void
+tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+    int ti_locked)
+{
+	int thflags;
+	u_long tiwin;
+	char *s;
+	struct in_conninfo *inc;
+	struct tcpopt to;
+
+	thflags = th->th_flags;
+	tp->sackhint.last_sack_ack = 0;
+	inc = &tp->t_inpcb->inp_inc;
+	/*
+	 * If this is either a state-changing packet or current state isn't
+	 * established, we require a write lock on tcbinfo.  Otherwise, we
+	 * allow the tcbinfo to be in either locked or unlocked, as the
+	 * caller may have unnecessarily acquired a write lock due to a race.
+	 */
+	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+	    tp->t_state != TCPS_ESTABLISHED) {
+		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+		    "SYN/FIN/RST/!EST", __func__, ti_locked));
+		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+	} else {
+#ifdef INVARIANTS
+		if (ti_locked == TI_RLOCKED) {
+			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		} else {
+			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+			    "ti_locked: %d", __func__, ti_locked));
+			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+		}
+#endif
+	}
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+	    __func__));
+	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+	    __func__));
+
+	/*
+	 * Segment received on connection.
+	 * Reset idle time and keep-alive timer.
+	 * XXX: This should be done after segment
+	 * validation to ignore broken/spoofed segs.
+	 */
+	tp->t_rcvtime = ticks;
+	if (TCPS_HAVEESTABLISHED(tp->t_state))
+		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+
+	/*
+	 * Unscale the window into a 32-bit value.
+	 * For the SYN_SENT state the scale is zero.
+	 */
+	tiwin = th->th_win << tp->snd_scale;
+
+	/*
+	 * TCP ECN processing.
+	 */
+	if (tp->t_flags & TF_ECN_PERMIT) {
+		if (thflags & TH_CWR)
+			tp->t_flags &= ~TF_ECN_SND_ECE;
+		switch (iptos & IPTOS_ECN_MASK) {
+		case IPTOS_ECN_CE:
+			tp->t_flags |= TF_ECN_SND_ECE;
+			TCPSTAT_INC(tcps_ecn_ce);
+			break;
+		case IPTOS_ECN_ECT0:
+			TCPSTAT_INC(tcps_ecn_ect0);
+			break;
+		case IPTOS_ECN_ECT1:
+			TCPSTAT_INC(tcps_ecn_ect1);
+			break;
+		}
+		/* Congestion experienced. */
+		if (thflags & TH_ECE) {
+			cc_cong_signal(tp, th, CC_ECN);
+		}
+	}
+
+	/*
+	 * Parse options on any incoming segment.
+	 */
+	tcp_dooptions(&to, (u_char *)(th + 1),
+	    (th->th_off << 2) - sizeof(struct tcphdr),
+	    (thflags & TH_SYN) ? TO_SYN : 0);
+
+	/*
+	 * If echoed timestamp is later than the current time,
+	 * fall back to non RFC1323 RTT calculation.  Normalize
+	 * timestamp if syncookies were used when this connection
+	 * was established.
+	 */
+	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+		to.to_tsecr -= tp->ts_offset;
+		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+			to.to_tsecr = 0;
+	}
+	/*
+	 * If timestamps were negotiated during SYN/ACK they should
+	 * appear on every segment during this session and vice versa.
+	 */
+	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+			    "no action\n", s, __func__);
+			free(s, M_TCPLOG);
+		}
+	}
+	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+			    "no action\n", s, __func__);
+			free(s, M_TCPLOG);
+		}
+	}
+
+	/*
+	 * Process options only when we get SYN/ACK back. The SYN case
+	 * for incoming connections is handled in tcp_syncache.
+	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+	 * or <SYN,ACK>) segment itself is never scaled.
+	 * XXX this is traditional behavior, may need to be cleaned up.
+	 */
+	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+		if ((to.to_flags & TOF_SCALE) &&
+		    (tp->t_flags & TF_REQ_SCALE)) {
+			tp->t_flags |= TF_RCVD_SCALE;
+			tp->snd_scale = to.to_wscale;
+		}
+		/*
+		 * Initial send window.  It will be updated with
+		 * the next incoming segment to the scaled value.
+		 */
+		tp->snd_wnd = th->th_win;
+		if (to.to_flags & TOF_TS) {
+			tp->t_flags |= TF_RCVD_TSTMP;
+			tp->ts_recent = to.to_tsval;
+			tp->ts_recent_age = tcp_ts_getticks();
+		}
+		if (to.to_flags & TOF_MSS)
+			tcp_mss(tp, to.to_mss);
+		if ((tp->t_flags & TF_SACK_PERMIT) &&
+		    (to.to_flags & TOF_SACKPERM) == 0)
+			tp->t_flags &= ~TF_SACK_PERMIT;
+	}
+	/*
+	 * Header prediction: check for the two common cases
+	 * of a uni-directional data xfer.  If the packet has
+	 * no control flags, is in-sequence, the window didn't
+	 * change and we're not retransmitting, it's a
+	 * candidate.  If the length is zero and the ack moved
+	 * forward, we're the sender side of the xfer.  Just
+	 * free the data acked & wake any higher level process
+	 * that was blocked waiting for space.  If the length
+	 * is non-zero and the ack didn't move, we're the
+	 * receiver side.  If we're getting packets in-order
+	 * (the reassembly queue is empty), add the data to
+	 * the socket buffer and note that we need a delayed ack.
+	 * Make sure that the hidden state-flags are also off.
+	 * Since we check for TCPS_ESTABLISHED first, it can only
+	 * be TH_NEEDSYN.
+	 */
+	if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
+	    __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
+	    __predict_true(tlen == 0) &&
+	    __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
+	    __predict_true(LIST_EMPTY(&tp->t_segq)) &&
+	    __predict_true(th->th_seq == tp->rcv_nxt)) {
+		if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
+		    ti_locked, tiwin)) {
+			return;
+		}
+	}
+	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
+	    ti_locked, tiwin, thflags);
+}
+
+struct tcp_function_block __tcp_fastslow = {
+	.tfb_tcp_block_name = "fastslow",
+	.tfb_tcp_output = tcp_output,
+	.tfb_tcp_do_segment = tcp_do_segment_fastslow,
+	.tfb_tcp_ctloutput = tcp_default_ctloutput,
+};
+
+struct tcp_function_block __tcp_fastack = {
+	.tfb_tcp_block_name = "fastack",
+	.tfb_tcp_output = tcp_output,
+	.tfb_tcp_do_segment = tcp_do_segment_fastack,
+	.tfb_tcp_ctloutput = tcp_default_ctloutput,
+};
+
+static int
+tcp_addfastpaths(module_t mod, int type, void *data)
+{
+	int err = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
+		if (err) {
+			printf("Failed to register fastack module -- err:%d\n", err);
+			return (err);
+		}
+		err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
+		if (err) {
+			printf("Failed to register fastslow module -- err:%d\n", err);
+			deregister_tcp_functions(&__tcp_fastack);
+			return (err);
+		}
+		break;
+	case MOD_QUIESCE:
+		if ((__tcp_fastslow.tfb_refcnt) || (__tcp_fastack.tfb_refcnt)) {
+			return (EBUSY);
+		}
+		break;
+	case MOD_UNLOAD:
+		err = deregister_tcp_functions(&__tcp_fastack);
+		if (err == EBUSY)
+			break;
+		err = deregister_tcp_functions(&__tcp_fastslow);
+		if (err == EBUSY)
+			break;
+		err = 0;
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	return (err);
+}
+
+static moduledata_t new_tcp_fastpaths = {
+	.name = "tcp_fastpaths",
+	.evhand = tcp_addfastpaths,
+	.priv = 0
+};
+
+MODULE_VERSION(kern_tcpfastpaths, 1);
+DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
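For illustration only (this sketch is not part of the patch): once the module above is loaded, the stacks it registers show up in the functions list. kldload(2) and sysctlbyname(3) are the standard interfaces; the module file name "fastpath" is assumed from KMOD= in the Makefile above.

/*
 * Illustrative userland sketch, not part of the patch: load the
 * fastpath module and print the comma-separated list of registered
 * TCP stacks.
 */
#include <sys/param.h>
#include <sys/linker.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	char buf[1024];
	size_t len = sizeof(buf);

	if (kldload("fastpath") < 0)
		warn("kldload fastpath (perhaps already loaded?)");
	if (sysctlbyname("net.inet.tcp.functions_available", buf, &len,
	    NULL, 0) < 0)
		err(1, "net.inet.tcp.functions_available");
	printf("available TCP stacks: %s\n", buf);
	return (0);
}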
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 #include
 #ifdef INET6
 #include
@@ -125,6 +126,8 @@
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
+struct rwlock tcp_function_lock;
+
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
@@ -236,6 +239,223 @@
 	    void *ip4hdr, const void *ip6hdr);
 static void	tcp_timer_discard(struct tcpcb *, uint32_t);
 
+
+static struct tcp_function_block tcp_def_funcblk = {
+	.tfb_tcp_block_name = "default",
+	.tfb_tcp_output = tcp_output,
+	.tfb_tcp_do_segment = tcp_do_segment,
+	.tfb_tcp_ctloutput = tcp_default_ctloutput,
+};
+
+struct tcp_funchead t_functions;
+static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
+
+static struct tcp_function_block *
+find_tcp_functions_locked(struct tcp_function_set *fs)
+{
+	struct tcp_function *f;
+	struct tcp_function_block *blk = NULL;
+
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) {
+			blk = f->tf_fb;
+			break;
+		}
+	}
+	return (blk);
+}
+
+static struct tcp_function_block *
+find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
+{
+	struct tcp_function_block *rblk = NULL;
+	struct tcp_function *f;
+
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		if (f->tf_fb == blk) {
+			rblk = blk;
+			if (s) {
+				*s = f;
+			}
+			break;
+		}
+	}
+	return (rblk);
+}
+
+struct tcp_function_block *
+find_and_ref_tcp_functions(struct tcp_function_set *fs)
+{
+	struct tcp_function_block *blk;
+
+	rw_rlock(&tcp_function_lock);
+	blk = find_tcp_functions_locked(fs);
+	if (blk)
+		refcount_acquire(&blk->tfb_refcnt);
+	rw_runlock(&tcp_function_lock);
+	return (blk);
+}
+
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *blk)
+{
+	struct tcp_function_block *rblk;
+
+	rw_rlock(&tcp_function_lock);
+	rblk = find_tcp_fb_locked(blk, NULL);
+	if (rblk)
+		refcount_acquire(&rblk->tfb_refcnt);
+	rw_runlock(&tcp_function_lock);
+	return (rblk);
+}
+
+
+static int
+sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
+{
+	int error = ENOENT;
+	struct tcp_function_set fs;
+	struct tcp_function_block *blk;
+
+	memset(&fs, 0, sizeof(fs));
+	rw_rlock(&tcp_function_lock);
+	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
+	if (blk) {
+		/* Found it. */
+		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
+		fs.pcbcnt = blk->tfb_refcnt;
+	}
+	rw_runlock(&tcp_function_lock);
+	error = sysctl_handle_string(oidp, fs.function_set_name,
+	    sizeof(fs.function_set_name), req);
+
+	/* Check for error or no change */
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	rw_wlock(&tcp_function_lock);
+	blk = find_tcp_functions_locked(&fs);
+	if ((blk == NULL) ||
+	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
+		error = ENOENT;
+		goto done;
+	}
+	tcp_func_set_ptr = blk;
+done:
+	rw_wunlock(&tcp_function_lock);
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
+    CTLTYPE_STRING | CTLFLAG_RW,
+    NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
+    "Set/get the default TCP functions");
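For illustration only (not part of the patch): the handler above can be driven from userland with sysctlbyname(3) to read and then change the system-wide default stack. The stack name "fastack" is an assumption; any registered name works.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <netinet/tcp.h>	/* TCP_FUNCTION_NAME_LEN_MAX */
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char name[TCP_FUNCTION_NAME_LEN_MAX];
	size_t len = sizeof(name);

	if (sysctlbyname("net.inet.tcp.functions_default", name, &len,
	    NULL, 0) < 0)
		err(1, "get functions_default");
	printf("current default stack: %s\n", name);

	/* New connections created after this will use "fastack". */
	if (sysctlbyname("net.inet.tcp.functions_default", NULL, NULL,
	    "fastack", strlen("fastack") + 1) < 0)
		err(1, "set functions_default");
	return (0);
}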
+static int
+sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
+{
+	int error = 0, cnt, at;
+	struct tcp_function *f, *n;
+	char *buffer = NULL;
+	size_t sz;
+
+	rw_rlock(&tcp_function_lock);
+	cnt = 0;
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		cnt++;
+	}
+	rw_runlock(&tcp_function_lock);
+
+	sz = ((cnt+1) * TCP_FUNCTION_NAME_LEN_MAX) + cnt;
+	buffer = malloc(sz, M_TEMP, M_WAITOK);
+	if (buffer == NULL)
+		return (ENOMEM);
+
+	memset(buffer, 0, sz);
+	error = at = 0;
+
+	n = NULL;
+	rw_rlock(&tcp_function_lock);
+	TAILQ_FOREACH_SAFE(f, &t_functions, tf_next, n) {
+		if (at+TCP_FUNCTION_NAME_LEN_MAX > sz) {
+			error = EOVERFLOW;
+			break;
+		}
+		strcpy(&buffer[at], f->tf_fb->tfb_tcp_block_name);
+		at += strlen(f->tf_fb->tfb_tcp_block_name);
+		if (n) {
+			buffer[at] = ',';
+			at++;
+		}
+	}
+	rw_runlock(&tcp_function_lock);
+	if (error == 0) {
+		sz = at + 1;
+		error = sysctl_handle_string(oidp, buffer, sz, req);
+	}
+	free(buffer, M_TEMP);
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
+    CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, sysctl_net_inet_list_available, "A",
+    "List available TCP function sets");
+
+static int
+sysctl_net_inet_tcp_functions_list(SYSCTL_HANDLER_ARGS)
+{
+	int error = 0, n, at;
+	struct tcp_function *f;
+	struct tcp_function_set *fs = NULL;
+	size_t sz;
+
+	rw_rlock(&tcp_function_lock);
+	n = 0;
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		n++;
+	}
+	rw_runlock(&tcp_function_lock);
+	if (req->oldptr == NULL) {
+		req->oldidx = ((n+2) * sizeof(struct tcp_function_set));
+		return (0);
+	}
+	sz = n * sizeof(struct tcp_function_set);
+	fs = malloc(sz, M_TEMP, M_WAITOK);
+	if (fs == NULL) {
+		return (ENOMEM);
+	}
+	at = 0;
+	memset(fs, 0, sz);
+	rw_rlock(&tcp_function_lock);
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		strcpy(fs[at].function_set_name, f->tf_fb->tfb_tcp_block_name);
+		fs[at].pcbcnt = f->tf_fb->tfb_refcnt;
+		at++;
+		if (at >= n)
+			break;
+	}
+	rw_runlock(&tcp_function_lock);
+	error = SYSCTL_OUT(req, fs, (at * sizeof(struct tcp_function_set)));
+	free(fs, M_TEMP);
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_list_detail,
+    CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RD, 0, 0,
+    &sysctl_net_inet_tcp_functions_list, "I",
+    "List the TCP function stacks and their reference counts");
+
+
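For illustration only (not part of the patch): the struct-returning sysctl above is consumed as an array of struct tcp_function_set, which is essentially what the tcp_function_ctrl utility added later in this patch does.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <netinet/tcp.h>	/* struct tcp_function_set */
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct tcp_function_set *fs;
	size_t sz = 0;
	u_int i, n;

	/* The first call sizes the buffer, the second fills it. */
	if (sysctlbyname("net.inet.tcp.functions_list_detail", NULL, &sz,
	    NULL, 0) < 0)
		err(1, "size functions_list_detail");
	fs = malloc(sz);
	if (fs == NULL)
		err(1, "malloc");
	if (sysctlbyname("net.inet.tcp.functions_list_detail", fs, &sz,
	    NULL, 0) < 0)
		err(1, "read functions_list_detail");
	n = sz / sizeof(*fs);
	for (i = 0; i < n; i++)
		printf("%-32s pcbs: %u\n", fs[i].function_set_name,
		    fs[i].pcbcnt);
	free(fs);
	return (0);
}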
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
@@ -263,6 +483,8 @@
 #define V_tcpcb_zone			VNET(tcpcb_zone)
 
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
+
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
@@ -311,6 +533,96 @@
 	return (hashsize);
 }
 
+int
+register_tcp_functions(struct tcp_function_block *blk, int wait)
+{
+	struct tcp_function_block *lblk;
+	struct tcp_function *n;
+	struct tcp_function_set fs;
+
+	if ((blk->tfb_tcp_output == NULL) ||
+	    (blk->tfb_tcp_do_segment == NULL) ||
+	    (blk->tfb_tcp_ctloutput == NULL) ||
+	    (strlen(blk->tfb_tcp_block_name) == 0)) {
+		/*
+		 * These functions are required and you
+		 * need a name.
+		 */
+		return (EINVAL);
+	}
+	if (blk->tfb_tcp_timer_stop_all ||
+	    blk->tfb_tcp_timers_left ||
+	    blk->tfb_tcp_timer_activate ||
+	    blk->tfb_tcp_timer_active ||
+	    blk->tfb_tcp_timer_stop) {
+		/*
+		 * If you define one timer function you
+		 * must have them all.
+		 */
+		if ((blk->tfb_tcp_timer_stop_all == NULL) ||
+		    (blk->tfb_tcp_timers_left == NULL) ||
+		    (blk->tfb_tcp_timer_activate == NULL) ||
+		    (blk->tfb_tcp_timer_active == NULL) ||
+		    (blk->tfb_tcp_timer_stop == NULL)) {
+			return (EINVAL);
+		}
+	}
+	n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
+	if (n == NULL) {
+		return (ENOMEM);
+	}
+	n->tf_fb = blk;
+	strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
+	rw_wlock(&tcp_function_lock);
+	lblk = find_tcp_functions_locked(&fs);
+	if (lblk) {
+		/* Duplicate name space not allowed */
+		rw_wunlock(&tcp_function_lock);
+		free(n, M_TCPFUNCTIONS);
+		return (EALREADY);
+	}
+	refcount_init(&blk->tfb_refcnt, 0);
+	blk->tfb_flags = 0;
+	TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
+	rw_wunlock(&tcp_function_lock);
+	return (0);
+}
+
+int
+deregister_tcp_functions(struct tcp_function_block *blk)
+{
+	struct tcp_function_block *lblk;
+	struct tcp_function *f;
+	int error = ENOENT;
+
+	if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
+		/* You can't un-register the default */
+		return (EPERM);
+	}
+	rw_wlock(&tcp_function_lock);
+	if (blk == tcp_func_set_ptr) {
+		/* You can't free the current default */
+		rw_wunlock(&tcp_function_lock);
+		return (EBUSY);
+	}
+	if (blk->tfb_refcnt) {
+		/* Still tcb attached, mark it. */
+		blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+		rw_wunlock(&tcp_function_lock);
+		return (EBUSY);
+	}
+	lblk = find_tcp_fb_locked(blk, &f);
+	if (lblk) {
+		/* Found */
+		TAILQ_REMOVE(&t_functions, f, tf_next);
+		f->tf_fb = NULL;
+		free(f, M_TCPFUNCTIONS);
+		error = 0;
+	}
+	rw_wunlock(&tcp_function_lock);
+	return (error);
+}
+
 void
 tcp_init(void)
 {
@@ -325,7 +637,10 @@
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
-
+	/* Setup the tcp function block list */
+	TAILQ_INIT(&t_functions);
+	rw_init_flags(&tcp_function_lock, "tcp_func_lock", 0);
+	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
@@ -768,7 +1083,13 @@
 	tp->ccv = &tm->ccv;
 	tp->ccv->type = IPPROTO_TCP;
 	tp->ccv->ccvc.tcp = tp;
-
+	rw_rlock(&tcp_function_lock);
+	tp->t_fb = tcp_func_set_ptr;
+	refcount_acquire(&tp->t_fb->tfb_refcnt);
+	rw_runlock(&tcp_function_lock);
+	if (tp->t_fb->tfb_tcp_fb_init) {
+		(*tp->t_fb->tfb_tcp_fb_init)(tp);
+	}
 	/*
 	 * Use the current system default CC algorithm.
 	 */
@@ -779,6 +1100,9 @@
 
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+			if (tp->t_fb->tfb_tcp_fb_fini)
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			refcount_release(&tp->t_fb->tfb_refcnt);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
@@ -785,6 +1109,9 @@
 
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
+		if (tp->t_fb->tfb_tcp_fb_fini)
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+		refcount_release(&tp->t_fb->tfb_refcnt);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
 	}
@@ -925,7 +1252,7 @@
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
-		(void) tcp_output(tp);
+		(void) tp->t_fb->tfb_tcp_output(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
@@ -960,6 +1287,10 @@
 	tcp_timer_stop(tp, TT_KEEP);
 	tcp_timer_stop(tp, TT_2MSL);
 	tcp_timer_stop(tp, TT_DELACK);
+	if (tp->t_fb->tfb_tcp_timer_stop_all) {
+		/* Call the stop-all function of the stack. */
+		tp->t_fb->tfb_tcp_timer_stop_all(tp);
+	}
 
 	/*
 	 * If we got enough samples through the srtt filter,
@@ -1044,6 +1375,14 @@
 	inp->inp_ppcb = NULL;
 	if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
 		/* We own the last reference on tcpcb, let's free it. */
+		if ((tp->t_fb->tfb_tcp_timers_left) &&
+		    (tp->t_fb->tfb_tcp_timers_left(tp))) {
+			/* Some fb timers still running! */
+			return;
+		}
+		if (tp->t_fb->tfb_tcp_fb_fini)
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
 		released = in_pcbrele_wlocked(inp);
@@ -1105,6 +1444,14 @@
 	tp->t_timers->tt_flags &= ~timer_type;
 	if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
 		/* We own the last reference on this tcpcb, let's free it. */
+		if ((tp->t_fb->tfb_tcp_timers_left) &&
+		    (tp->t_fb->tfb_tcp_timers_left(tp))) {
+			/* Some fb timers still running! */
+			goto leave;
+		}
+		if (tp->t_fb->tfb_tcp_fb_fini)
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
 		if (in_pcbrele_wlocked(inp)) {
@@ -1113,6 +1460,7 @@
 			return;
 		}
 	}
+leave:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
@@ -1865,7 +2213,7 @@
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
-	tcp_output(tp);
+	tp->t_fb->tfb_tcp_output(tp);
 }
 
 #ifdef INET
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -626,6 +627,7 @@
 static struct socket *
 syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 {
+	struct tcp_function_block *blk;
 	struct inpcb *inp = NULL;
 	struct socket *so;
 	struct tcpcb *tp;
@@ -817,6 +819,26 @@
 	tp->irs = sc->sc_irs;
 	tcp_rcvseqinit(tp);
 	tcp_sendseqinit(tp);
+	blk = sototcpcb(lso)->t_fb;
+	if (blk != tp->t_fb) {
+		/*
+		 * Our parent's t_fb was not the default;
+		 * we need to release our ref on tp->t_fb and
+		 * pick up one on the new entry.
+		 */
+		struct tcp_function_block *rblk;
+
+		rblk = find_and_ref_tcp_fb(blk);
+		KASSERT(rblk != NULL,
+		    ("cannot find blk %p out of syncache?", blk));
+		if (tp->t_fb->tfb_tcp_fb_fini)
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+		refcount_release(&tp->t_fb->tfb_refcnt);
+		tp->t_fb = rblk;
+		if (tp->t_fb->tfb_tcp_fb_init) {
+			(*tp->t_fb->tfb_tcp_fb_init)(tp);
+		}
+	}
 	tp->snd_wl1 = sc->sc_irs;
 	tp->snd_max = tp->iss + 1;
 	tp->snd_nxt = tp->iss + 1;
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c
+++ sys/netinet/tcp_timer.c
@@ -292,7 +292,7 @@
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
-	(void) tcp_output(tp);
+	(void) tp->t_fb->tfb_tcp_output(tp);
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
@@ -543,7 +543,7 @@
 	}
 	tcp_setpersist(tp);
 	tp->t_flags |= TF_FORCEDATA;
-	(void) tcp_output(tp);
+	(void) tp->t_fb->tfb_tcp_output(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
 out:
@@ -798,7 +798,7 @@
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 
-	(void) tcp_output(tp);
+	(void) tp->t_fb->tfb_tcp_output(tp);
 
 out:
 #ifdef TCPDEBUG
@@ -858,6 +858,10 @@
 		f_reset = TT_2MSL_RST;
 		break;
 	default:
+		if (tp->t_fb->tfb_tcp_timer_activate) {
+			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
+			return;
+		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 	if (delta == 0) {
@@ -904,6 +908,9 @@
 		t_callout = &tp->t_timers->tt_2msl;
 		break;
 	default:
+		if (tp->t_fb->tfb_tcp_timer_active) {
+			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
+		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 	return callout_active(t_callout);
@@ -945,6 +952,14 @@
 		f_reset = TT_2MSL_RST;
 		break;
 	default:
+		if (tp->t_fb->tfb_tcp_timer_stop) {
+			/*
+			 * XXXrrs we need to look at this with the
+			 * stop case below (flags).
+			 */
+			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
+			return;
+		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -509,7 +510,7 @@
 		goto out;
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
-	error = tcp_output(tp);
+	error = tp->t_fb->tfb_tcp_output(tp);
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	INP_WUNLOCK(inp);
@@ -579,7 +580,7 @@
 		    (error = tcp_offload_connect(so, nam)) == 0)
 			goto out;
 #endif
-		error = tcp_output(tp);
+		error = tp->t_fb->tfb_tcp_output(tp);
 		goto out;
 	}
 #endif
@@ -597,7 +598,7 @@
 		goto out;
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
-	error = tcp_output(tp);
+	error = tp->t_fb->tfb_tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_CONNECT);
@@ -773,7 +774,7 @@
 	socantsendmore(so);
 	tcp_usrclosed(tp);
 	if (!(inp->inp_flags & INP_DROPPED))
-		error = tcp_output(tp);
+		error = tp->t_fb->tfb_tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_SHUTDOWN);
@@ -809,7 +810,7 @@
 		tcp_offload_rcvd(tp);
 	else
 #endif
-	tcp_output(tp);
+	tp->t_fb->tfb_tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_RCVD);
@@ -911,7 +912,7 @@
 	    !(flags & PRUS_NOTREADY)) {
 		if (flags & PRUS_MORETOCOME)
 			tp->t_flags |= TF_MORETOCOME;
-		error = tcp_output(tp);
+		error = tp->t_fb->tfb_tcp_output(tp);
 		if (flags & PRUS_MORETOCOME)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
@@ -961,7 +962,7 @@
 		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
 		if (!(flags & PRUS_NOTREADY)) {
 			tp->t_flags |= TF_FORCEDATA;
-			error = tcp_output(tp);
+			error = tp->t_fb->tfb_tcp_output(tp);
 			tp->t_flags &= ~TF_FORCEDATA;
 		}
 	}
@@ -997,7 +998,7 @@
 	error = sbready(&so->so_snd, m, count);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	if (error == 0)
-		error = tcp_output(tp);
+		error = tp->t_fb->tfb_tcp_output(tp);
 	INP_WUNLOCK(inp);
 
 	return (error);
@@ -1349,13 +1350,11 @@
 int
 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
-	int	error, opt, optval;
-	u_int	ui;
+	int	error;
 	struct	inpcb *inp;
 	struct	tcpcb *tp;
-	struct	tcp_info ti;
-	char	buf[TCP_CA_NAME_MAX];
-	struct	cc_algo *algo;
+	struct	tcp_function_block *blk;
+	struct	tcp_function_set fsn;
 
 	error = 0;
 	inp = sotoinpcb(so);
@@ -1383,7 +1382,83 @@
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
+	tp = intotcpcb(inp);
+	/*
+	 * Protect the TCP option TCP_FUNCTION_BLK so
+	 * that a sub-function can *never* overwrite this.
+	 */
+	if ((sopt->sopt_dir == SOPT_SET) &&
+	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+		INP_WUNLOCK(inp);
+		error = sooptcopyin(sopt, &fsn, sizeof fsn,
+		    sizeof fsn);
+		if (error)
+			return (error);
+		INP_WLOCK_RECHECK(inp);
+		if (tp->t_state != TCPS_CLOSED) {
+			/*
+			 * The user has advanced the state
+			 * past the initial point, we can't
+			 * switch since we are already down
+			 * the road and a new set of functions
+			 * may not be compatible.
+			 */
+			INP_WUNLOCK(inp);
+			return (EINVAL);
+		}
+		blk = find_and_ref_tcp_functions(&fsn);
+		if (blk == NULL) {
+			INP_WUNLOCK(inp);
+			return (ENOENT);
+		}
+		if (tp->t_fb != blk) {
+			if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+				refcount_release(&blk->tfb_refcnt);
+				INP_WUNLOCK(inp);
+				return (ENOENT);
+			}
+			/*
+			 * Release the old refcnt; the
+			 * lookup acquired a ref on the
+			 * new one.
+			 */
+			if (tp->t_fb->tfb_tcp_fb_fini)
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			refcount_release(&tp->t_fb->tfb_refcnt);
+			tp->t_fb = blk;
+			if (tp->t_fb->tfb_tcp_fb_init) {
+				(*tp->t_fb->tfb_tcp_fb_init)(tp);
+			}
+		}
+#ifdef TCP_OFFLOAD
+		if (tp->t_flags & TF_TOE) {
+			tcp_offload_ctloutput(tp, sopt->sopt_dir,
+			    sopt->sopt_name);
+		}
+#endif
+		INP_WUNLOCK(inp);
+		return (error);
+	} else if ((sopt->sopt_dir == SOPT_GET) &&
+	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+		strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name);
+		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
+		INP_WUNLOCK(inp);
+		error = sooptcopyout(sopt, &fsn, sizeof fsn);
+		return (error);
+	}
+	/* Pass in the INP locked; the caller must unlock it. */
+	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
+}
+
+int
+tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+	int	error, opt, optval;
+	u_int	ui;
+	struct	tcp_info ti;
+	struct	cc_algo *algo;
+	char	buf[TCP_CA_NAME_MAX];
+
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
@@ -1451,7 +1526,7 @@
 			else if (tp->t_flags & TF_NOPUSH) {
 				tp->t_flags &= ~TF_NOPUSH;
 				if (TCPS_HAVEESTABLISHED(tp->t_state))
-					error = tcp_output(tp);
+					error = tp->t_fb->tfb_tcp_output(tp);
 			}
 			goto unlock_and_done;
@@ -1770,7 +1845,7 @@
 			sbflush(&so->so_rcv);
 			tcp_usrclosed(tp);
 			if (!(inp->inp_flags & INP_DROPPED))
-				tcp_output(tp);
+				tp->t_fb->tfb_tcp_output(tp);
 		}
 	}
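For illustration only (not part of the patch): the TCP_FUNCTION_BLK option handled above is driven from userland like any other TCP-level socket option. Per the state check above it must be set while the socket is still CLOSED, i.e. before connect(2). The stack name "fastack" is an assumption.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_FUNCTION_BLK, struct tcp_function_set */
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct tcp_function_set fs;
	socklen_t len = sizeof(fs);
	int s;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s < 0)
		err(1, "socket");

	/* Must be done before connect(); the kernel rejects it later. */
	memset(&fs, 0, sizeof(fs));
	strncpy(fs.function_set_name, "fastack",
	    sizeof(fs.function_set_name) - 1);
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs)) < 0)
		err(1, "setsockopt TCP_FUNCTION_BLK");

	if (getsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, &len) < 0)
		err(1, "getsockopt TCP_FUNCTION_BLK");
	printf("socket is using the %s stack\n", fs.function_set_name);
	close(s);
	return (0);
}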
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -88,6 +88,48 @@
 
 #define tcp6cb		tcpcb  /* for KAME src sync over BSD*'s */
 
+/*
+ * TODO: We still need to brave plowing into
+ * tcp_input() and the pru_usrreq() block.
+ * Right now these go to the old standards, which
+ * are somewhat ok, but in the long term they may
+ * need to be changed.  If we do tackle tcp_input()
+ * then we need to get rid of the tcp_do_segment()
+ * function below.
+ */
+/* Flags for tcp functions */
+#define	TCP_FUNC_BEING_REMOVED	0x01	/* Can no longer be referenced */
+
+struct tcp_function_block {
+	char	tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
+	int	(*tfb_tcp_output)(struct tcpcb *);
+	void	(*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
+		    struct socket *, struct tcpcb *,
+		    int, int, uint8_t,
+		    int);
+	int	(*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
+		    struct inpcb *inp, struct tcpcb *tp);
+	/* Optional memory allocation/free routine */
+	void	(*tfb_tcp_fb_init)(struct tcpcb *);
+	void	(*tfb_tcp_fb_fini)(struct tcpcb *);
+	/* Optional timers, must define all if you define one */
+	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
+	int	(*tfb_tcp_timers_left)(struct tcpcb *);
+	void	(*tfb_tcp_timer_activate)(struct tcpcb *,
+		    uint32_t, u_int);
+	int	(*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
+	void	(*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
+	volatile uint32_t tfb_refcnt;
+	uint32_t tfb_flags;
+};
+
+struct tcp_function {
+	TAILQ_ENTRY(tcp_function) tf_next;
+	struct tcp_function_block *tf_fb;
+};
+
+TAILQ_HEAD(tcp_funchead, tcp_function);
+
 /*
  * Tcp control block, one per tcp; fields:
  * Organized for 16 byte cacheline efficiency.
@@ -206,9 +248,10 @@
 	u_int	t_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 	u_int	t_pmtud_saved_maxopd;	/* pre-blackhole MSS */
 	u_int	t_flags2;		/* More tcpcb flags storage */
-	uint32_t t_ispare[8];		/* 5 UTO, 3 TBD */
-	void	*t_pspare2[4];		/* 1 TCP_SIGNATURE, 3 TBD */
+	struct tcp_function_block *t_fb;/* TCP function call block */
+	void	*t_fb_ptr;		/* Pointer to t_fb specific data */
+	void	*t_pspare2[2];		/* 1 TCP_SIGNATURE, 1 TBD */
 #if defined(_KERNEL) && defined(TCPPCAP)
 	struct mbufq t_inpkts;	/* List of saved input packets. */
 	struct mbufq t_outpkts;	/* List of saved output packets. */
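A hypothetical kernel-side sketch of the two optional hooks above (the names and the counter are invented for illustration; this is not part of the patch): a stack that keeps per-connection state hangs it off t_fb_ptr in tfb_tcp_fb_init and releases it in tfb_tcp_fb_fini, which the kernel calls when a tcpcb joins or leaves the stack and when the tcpcb is destroyed.

/* Hypothetical per-connection state for an example stack. */
struct example_fb_state {
	uint32_t efs_acks_seen;		/* invented counter */
};

static void
example_fb_init(struct tcpcb *tp)
{
	/*
	 * M_WAITOK is assumed safe here for brevity; a real stack must
	 * consider the locking context of every caller of this hook.
	 * M_TCPFUNCTIONS would need a MALLOC_DECLARE() in a module.
	 */
	tp->t_fb_ptr = malloc(sizeof(struct example_fb_state),
	    M_TCPFUNCTIONS, M_WAITOK | M_ZERO);
}

static void
example_fb_fini(struct tcpcb *tp)
{
	free(tp->t_fb_ptr, M_TCPFUNCTIONS);
	tp->t_fb_ptr = NULL;
}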
@@ -534,6 +577,8 @@
 #define	tcps_rcvmemdrop	tcps_rcvreassfull	/* compat */
 
 #ifdef _KERNEL
+#define	TI_UNLOCKED	1
+#define	TI_RLOCKED	2
 #include
 
 VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat);	/* tcp statistics */
@@ -684,7 +729,32 @@
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
 void	 tcp_reass_global_init(void);
 void	 tcp_reass_flush(struct tcpcb *);
+void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
+void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+	     struct tcpcb *, int, int);
+void	 tcp_pulloutofband(struct socket *,
+	     struct tcphdr *, struct mbuf *, int);
+void	 tcp_xmit_timer(struct tcpcb *, int);
+void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+void	 cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+	     uint16_t type);
+void	 cc_conn_init(struct tcpcb *tp);
+void	 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+void	 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+void	 hhook_run_tcp_est_in(struct tcpcb *tp,
+	     struct tcphdr *th, struct tcpopt *to);
+
 int	 tcp_input(struct mbuf **, int *, int);
+void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
+	     struct socket *, struct tcpcb *, int, int, uint8_t,
+	     int);
+
+int	 register_tcp_functions(struct tcp_function_block *blk, int wait);
+int	 deregister_tcp_functions(struct tcp_function_block *blk);
+struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
+struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+int	 tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
+
 u_long	 tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 u_long	 tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
 void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
Index: sys/netinet/toecore.c
===================================================================
--- sys/netinet/toecore.c
+++ sys/netinet/toecore.c
@@ -509,7 +509,7 @@
 		KASSERT(!(tp->t_flags & TF_TOE),
 		    ("%s: tp %p still offloaded.", __func__, tp));
 		tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
-		(void) tcp_output(tp);
+		(void) tp->t_fb->tfb_tcp_output(tp);
 	} else {
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
Index: usr.sbin/Makefile
===================================================================
--- usr.sbin/Makefile
+++ usr.sbin/Makefile
@@ -84,6 +84,7 @@
 	spray \
 	syslogd \
 	sysrc \
+	tcp_function_ctrl \
 	tcpdrop \
 	tcpdump \
 	traceroute \
Index: usr.sbin/tcp_function_ctrl/Makefile
===================================================================
--- usr.sbin/tcp_function_ctrl/Makefile
+++ usr.sbin/tcp_function_ctrl/Makefile
@@ -0,0 +1,12 @@
+#	@(#)Makefile	8.1 (Berkeley) 6/9/93
+# $FreeBSD$
+
+PROG=	tcp_function_ctrl
+MAN=	tcp_function_ctrl.8
+SRCS=	tcp_function_ctrl.c
+CFLAGS+= -Wall -Werror
+
+BINDIR=	/usr/bin
+
+.include <bsd.prog.mk>
+
Index: usr.sbin/tcp_function_ctrl/tcp_function_ctrl.8
===================================================================
--- usr.sbin/tcp_function_ctrl/tcp_function_ctrl.8
+++ usr.sbin/tcp_function_ctrl/tcp_function_ctrl.8
@@ -0,0 +1,73 @@
+.\" Copyright (c) 2015
+.\"	Netflix Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd October 31, 2015
+.Dt TCP_FUNCTION_CTRL 8
+.Os
+.Sh NAME
+.Nm tcp_function_ctrl
+.Nd control and list TCP function sets
+.Sh SYNOPSIS
+.Nm
+.Fl s Ar name
+.Nm
+.Fl l
+.Sh DESCRIPTION
+The
+.Nm
+utility lists and controls the TCP function sets that are available
+and in use on the system.
+The system has a default function set that contains the standard
+.Fx
+TCP functions.
+Users may load kernel modules which add different TCP behaviors
+through new function sets; each set is named by its designer and
+becomes available once the module is installed.
+The
+.Nm
+utility can list the installed function sets, report which set is the
+default, or select a new default.
+The default function set is the one that any new TCP connection will
+use.
+Users may also use the TCP_FUNCTION_BLK socket option to change the
+function set of a specific TCP socket at startup.
+With no arguments,
+.Nm
+displays the default TCP function set, normally named
+.Dq default .
+.Sh OPTIONS
+The following options are available:
+.Bl -tag -width indent
+.It Fl l
+List the TCP function sets installed on this machine together with
+the number of PCBs using each one.
+The default function set is indicated by a star in the D column.
+.It Fl s Ar name
+Set the default TCP function set to
+.Ar name .
+.El
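+.\" The EXAMPLES section below is a suggested sketch, not part of the
+.\" original patch; the flags shown are the ones documented above.
+.Sh EXAMPLES
+List the installed TCP function sets and their PCB counts:
+.Bd -literal -offset indent
+tcp_function_ctrl -l
+.Ed
+.Pp
+Make the
+.Dq fastack
+function set the default for new connections:
+.Bd -literal -offset indent
+tcp_function_ctrl -s fastack
+.Ed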
+.Sh SEE ALSO
+.Xr tcp 4
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+.An Randall Stewart Aq Mt rrs@FreeBSD.org
Index: usr.sbin/tcp_function_ctrl/tcp_function_ctrl.c
===================================================================
--- usr.sbin/tcp_function_ctrl/tcp_function_ctrl.c
+++ usr.sbin/tcp_function_ctrl/tcp_function_ctrl.c
@@ -0,0 +1,101 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int
+main(int argc, char **argv)
+{
+	struct tcp_function_set *set;
+	char def[TCP_FUNCTION_NAME_LEN_MAX];
+	size_t sz;
+	int num, i;
+	char *name = NULL;
+	int list_functions = 0;
+
+	while ((i = getopt(argc, argv, "s:l?h")) != -1) {
+		switch (i) {
+		case 's':
+			name = optarg;
+			if (strlen(name) >= TCP_FUNCTION_NAME_LEN_MAX) {
+				printf("Name too large\n");
+				goto use;
+			}
+			break;
+		case 'l':
+			list_functions = 1;
+			break;
+		case 'h':
+		case '?':
+		use:
+			printf("Use %s [-s name | -l]\n", argv[0]);
+			printf(" -s name will set the default TCP function set\n");
+			printf(" -l will list all TCP function sets, their counts and the default (*)\n");
+			printf(" With no arguments display the default function set\n");
+			return (1);
+			break;
+		}
+	}
+	if ((name != NULL) && (list_functions == 1)) {
+		printf("Options are mutually exclusive, please specify only one\n");
+		goto use;
+	}
+	sz = sizeof(def);
+	if (sysctlbyname("net.inet.tcp.functions_default", &def, &sz,
+	    NULL, 0)) {
+		err(-1, "Error retrieving name of TCP default function set");
+	}
+	if (list_functions) {
+		sz = 0;
+		if (sysctlbyname("net.inet.tcp.functions_list_detail", NULL, &sz,
+		    NULL, 0)) {
+			err(-1, "Error retrieving list from the kernel for size calculation");
+		}
+		sz += sizeof(struct tcp_function_set);
+		set = malloc(sz);
+		if (set == NULL) {
+			err(-1, "Error allocating memory for list");
+		}
+		memset(set, 0, sz);
+		if (sysctlbyname("net.inet.tcp.functions_list_detail", set, &sz,
+		    NULL, 0)) {
+			err(-1, "Error retrieving list from the kernel");
+		}
+		num = sz/sizeof(struct tcp_function_set);
+		printf("The following TCP function sets are available\n");
+		printf("Stack D PCB count\n");
+		for(i=0; i