Index: head/cddl/lib/libdtrace/tcp.d =================================================================== --- head/cddl/lib/libdtrace/tcp.d (revision 357857) +++ head/cddl/lib/libdtrace/tcp.d (revision 357858) @@ -1,418 +1,418 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 Mark Johnston */ #pragma D depends_on library ip.d #pragma D depends_on module kernel #pragma D depends_on provider tcp /* * Convert a TCP state value to a string. */ #pragma D binding "1.6.3" TCPS_CLOSED inline int TCPS_CLOSED = 0; #pragma D binding "1.6.3" TCPS_LISTEN inline int TCPS_LISTEN = 1; #pragma D binding "1.6.3" TCPS_SYN_SENT inline int TCPS_SYN_SENT = 2; #pragma D binding "1.6.3" TCPS_SYN_RECEIVED inline int TCPS_SYN_RECEIVED = 3; #pragma D binding "1.6.3" TCPS_ESTABLISHED inline int TCPS_ESTABLISHED = 4; #pragma D binding "1.6.3" TCPS_CLOSE_WAIT inline int TCPS_CLOSE_WAIT = 5; #pragma D binding "1.6.3" TCPS_FIN_WAIT_1 inline int TCPS_FIN_WAIT_1 = 6; #pragma D binding "1.6.3" TCPS_CLOSING inline int TCPS_CLOSING = 7; #pragma D binding "1.6.3" TCPS_LAST_ACK inline int TCPS_LAST_ACK = 8; #pragma D binding "1.6.3" TCPS_FIN_WAIT_2 inline int TCPS_FIN_WAIT_2 = 9; #pragma D binding "1.6.3" TCPS_TIME_WAIT inline int TCPS_TIME_WAIT = 10; /* * For compatibility also provide the names used by Solaris. */ #pragma D binding "1.13" TCP_STATE_CLOSED inline int TCP_STATE_CLOSED = TCPS_CLOSED; #pragma D binding "1.13" TCP_STATE_LISTEN inline int TCP_STATE_LISTEN = TCPS_LISTEN; #pragma D binding "1.13" TCP_STATE_SYN_SENT inline int TCP_STATE_SYN_SENT = TCPS_SYN_SENT; #pragma D binding "1.13" TCP_STATE_SYN_RECEIVED inline int TCP_STATE_SYN_RECEIVED = TCPS_SYN_RECEIVED; #pragma D binding "1.13" TCP_STATE_ESTABLISHED inline int TCP_STATE_ESTABLISHED = TCPS_ESTABLISHED; #pragma D binding "1.13" TCP_STATE_CLOSE_WAIT inline int TCP_STATE_CLOSE_WAIT = TCPS_CLOSE_WAIT; #pragma D binding "1.13" TCP_STATE_FIN_WAIT_1 inline int TCP_STATE_FIN_WAIT_1 = TCPS_FIN_WAIT_1; #pragma D binding "1.13" TCP_STATE_CLOSING inline int TCP_STATE_CLOSING = TCPS_CLOSING; #pragma D binding "1.13" TCP_STATE_LAST_ACK inline int TCP_STATE_LAST_ACK = TCPS_LAST_ACK; #pragma D binding "1.13" TCP_STATE_FIN_WAIT_2 inline int TCP_STATE_FIN_WAIT_2 = TCPS_FIN_WAIT_2; #pragma D binding "1.13" TCP_STATE_TIME_WAIT inline int TCP_STATE_TIME_WAIT = TCPS_TIME_WAIT; /* TCP segment flags. */ #pragma D binding "1.6.3" TH_FIN inline uint8_t TH_FIN = 0x01; #pragma D binding "1.6.3" TH_SYN inline uint8_t TH_SYN = 0x02; #pragma D binding "1.6.3" TH_RST inline uint8_t TH_RST = 0x04; #pragma D binding "1.6.3" TH_PUSH inline uint8_t TH_PUSH = 0x08; #pragma D binding "1.6.3" TH_ACK inline uint8_t TH_ACK = 0x10; #pragma D binding "1.6.3" TH_URG inline uint8_t TH_URG = 0x20; #pragma D binding "1.6.3" TH_ECE inline uint8_t TH_ECE = 0x40; #pragma D binding "1.6.3" TH_CWR inline uint8_t TH_CWR = 0x80; /* TCP connection state strings. */ #pragma D binding "1.6.3" tcp_state_string inline string tcp_state_string[int32_t state] = state == TCPS_CLOSED ? "state-closed" : state == TCPS_LISTEN ? "state-listen" : state == TCPS_SYN_SENT ? "state-syn-sent" : state == TCPS_SYN_RECEIVED ? "state-syn-received" : state == TCPS_ESTABLISHED ? "state-established" : state == TCPS_CLOSE_WAIT ? "state-close-wait" : state == TCPS_FIN_WAIT_1 ? "state-fin-wait-1" : state == TCPS_CLOSING ? "state-closing" : state == TCPS_LAST_ACK ? "state-last-ack" : state == TCPS_FIN_WAIT_2 ? "state-fin-wait-2" : state == TCPS_TIME_WAIT ? "state-time-wait" : ""; /* * tcpsinfo contains stable TCP details from tcp_t. */ typedef struct tcpsinfo { uintptr_t tcps_addr; int tcps_local; /* is delivered locally, boolean */ int tcps_active; /* active open (from here), boolean */ uint16_t tcps_lport; /* local port */ uint16_t tcps_rport; /* remote port */ string tcps_laddr; /* local address, as a string */ string tcps_raddr; /* remote address, as a string */ int32_t tcps_state; /* TCP state */ uint32_t tcps_iss; /* Initial sequence # sent */ uint32_t tcps_irs; /* Initial sequence # received */ uint32_t tcps_suna; /* sequence # sent but unacked */ uint32_t tcps_smax; /* highest sequence number sent */ uint32_t tcps_snxt; /* next sequence # to send */ uint32_t tcps_rack; /* sequence # we have acked */ uint32_t tcps_rnxt; /* next sequence # expected */ u_long tcps_swnd; /* send window size */ int32_t tcps_snd_ws; /* send window scaling */ uint32_t tcps_swl1; /* window update seg seq number */ uint32_t tcps_swl2; /* window update seg ack number */ uint32_t tcps_rup; /* receive urgent pointer */ uint32_t tcps_radv; /* advertised window */ u_long tcps_rwnd; /* receive window size */ int32_t tcps_rcv_ws; /* receive window scaling */ u_long tcps_cwnd; /* congestion window */ u_long tcps_cwnd_ssthresh; /* threshold for congestion avoidance */ uint32_t tcps_srecover; /* for use in NewReno Fast Recovery */ uint32_t tcps_sack_fack; /* SACK sequence # we have acked */ uint32_t tcps_sack_snxt; /* next SACK seq # for retransmission */ uint32_t tcps_rto; /* round-trip timeout, msec */ uint32_t tcps_mss; /* max segment size */ int tcps_retransmit; /* retransmit send event, boolean */ int tcps_srtt; /* smoothed RTT in units of (TCP_RTT_SCALE*hz) */ int tcps_debug; /* socket has SO_DEBUG set */ int tcps_cookie; /* expose the socket's SO_USER_COOKIE */ int32_t tcps_dupacks; /* consecutive dup acks received */ uint32_t tcps_rtttime; /* RTT measurement start time */ uint32_t tcps_rtseq; /* sequence # being timed */ uint32_t tcps_ts_recent; /* timestamp echo data */ } tcpsinfo_t; /* * tcplsinfo provides the old tcp state for state changes. */ typedef struct tcplsinfo { int32_t tcps_state; /* previous TCP state */ } tcplsinfo_t; /* * tcpinfo is the TCP header fields. */ typedef struct tcpinfo { uint16_t tcp_sport; /* source port */ uint16_t tcp_dport; /* destination port */ uint32_t tcp_seq; /* sequence number */ uint32_t tcp_ack; /* acknowledgment number */ uint8_t tcp_offset; /* data offset, in bytes */ uint8_t tcp_flags; /* flags */ uint16_t tcp_window; /* window size */ uint16_t tcp_checksum; /* checksum */ uint16_t tcp_urgent; /* urgent data pointer */ struct tcphdr *tcp_hdr; /* raw TCP header */ } tcpinfo_t; /* * A clone of tcpinfo_t used to handle the fact that the TCP input path * overwrites some fields of the TCP header with their host-order equivalents. * Unfortunately, DTrace doesn't let us simply typedef a new name for struct * tcpinfo and define a separate translator for it. */ typedef struct tcpinfoh { uint16_t tcp_sport; /* source port */ uint16_t tcp_dport; /* destination port */ uint32_t tcp_seq; /* sequence number */ uint32_t tcp_ack; /* acknowledgment number */ uint8_t tcp_offset; /* data offset, in bytes */ uint8_t tcp_flags; /* flags */ uint16_t tcp_window; /* window size */ uint16_t tcp_checksum; /* checksum */ uint16_t tcp_urgent; /* urgent data pointer */ struct tcphdr *tcp_hdr; /* raw TCP header */ } tcpinfoh_t; #pragma D binding "1.6.3" translator translator csinfo_t < struct tcpcb *p > { cs_addr = NULL; cs_cid = (uint64_t)(p == NULL ? 0 : p->t_inpcb); cs_pid = 0; cs_zoneid = 0; }; #pragma D binding "1.6.3" translator translator tcpsinfo_t < struct tcpcb *p > { tcps_addr = (uintptr_t)p; tcps_local = -1; /* XXX */ tcps_active = -1; /* XXX */ tcps_lport = p == NULL ? 0 : ntohs(p->t_inpcb->inp_inc.inc_ie.ie_lport); tcps_rport = p == NULL ? 0 : ntohs(p->t_inpcb->inp_inc.inc_ie.ie_fport); tcps_laddr = p == NULL ? "" : p->t_inpcb->inp_vflag == INP_IPV4 ? inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id46_addr.ia46_addr4.s_addr) : inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id6_addr); tcps_raddr = p == NULL ? "" : p->t_inpcb->inp_vflag == INP_IPV4 ? inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id46_addr.ia46_addr4.s_addr) : inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id6_addr); tcps_state = p == NULL ? -1 : p->t_state; tcps_iss = p == NULL ? 0 : p->iss; tcps_irs = p == NULL ? 0 : p->irs; tcps_suna = p == NULL ? 0 : p->snd_una; tcps_smax = p == NULL ? 0 : p->snd_max; tcps_snxt = p == NULL ? 0 : p->snd_nxt; tcps_rack = p == NULL ? 0 : p->last_ack_sent; tcps_rnxt = p == NULL ? 0 : p->rcv_nxt; tcps_swnd = p == NULL ? -1 : p->snd_wnd; tcps_snd_ws = p == NULL ? -1 : p->snd_scale; tcps_swl1 = p == NULL ? -1 : p->snd_wl1; tcps_swl2 = p == NULL ? -1 : p->snd_wl2; tcps_radv = p == NULL ? -1 : p->rcv_adv; tcps_rwnd = p == NULL ? -1 : p->rcv_wnd; tcps_rup = p == NULL ? -1 : p->rcv_up; tcps_rcv_ws = p == NULL ? -1 : p->rcv_scale; tcps_cwnd = p == NULL ? -1 : p->snd_cwnd; tcps_cwnd_ssthresh = p == NULL ? -1 : p->snd_ssthresh; tcps_srecover = p == NULL ? -1 : p->snd_recover; tcps_sack_fack = p == NULL ? 0 : p->snd_fack; - tcps_sack_snxt = p == NULL ? 0 : p->sack_newdata; + tcps_sack_snxt = p == NULL ? 0 : p->snd_recover; tcps_rto = p == NULL ? -1 : (p->t_rxtcur * 1000) / `hz; tcps_mss = p == NULL ? -1 : p->t_maxseg; tcps_retransmit = p == NULL ? -1 : p->t_rxtshift > 0 ? 1 : 0; tcps_srtt = p == NULL ? -1 : p->t_srtt; /* smoothed RTT in units of (TCP_RTT_SCALE*hz) */ tcps_debug = p == NULL ? 0 : p->t_inpcb->inp_socket->so_options & 1; tcps_cookie = p == NULL ? -1 : p->t_inpcb->inp_socket->so_user_cookie; tcps_dupacks = p == NULL ? -1 : p->t_dupacks; tcps_rtttime = p == NULL ? -1 : p->t_rtttime; tcps_rtseq = p == NULL ? -1 : p->t_rtseq; tcps_ts_recent = p == NULL ? -1 : p->ts_recent; }; #pragma D binding "1.6.3" translator translator tcpinfo_t < struct tcphdr *p > { tcp_sport = p == NULL ? 0 : ntohs(p->th_sport); tcp_dport = p == NULL ? 0 : ntohs(p->th_dport); tcp_seq = p == NULL ? -1 : ntohl(p->th_seq); tcp_ack = p == NULL ? -1 : ntohl(p->th_ack); tcp_offset = p == NULL ? -1 : (p->th_off >> 2); tcp_flags = p == NULL ? 0 : p->th_flags; tcp_window = p == NULL ? 0 : ntohs(p->th_win); tcp_checksum = p == NULL ? 0 : ntohs(p->th_sum); tcp_urgent = p == NULL ? 0 : ntohs(p->th_urp); tcp_hdr = (struct tcphdr *)p; }; /* * This translator differs from the one for tcpinfo_t in that the sequence * number, acknowledgement number, window size and urgent pointer are already * in host order and thus don't need to be converted. */ #pragma D binding "1.6.3" translator translator tcpinfoh_t < struct tcphdr *p > { tcp_sport = p == NULL ? 0 : ntohs(p->th_sport); tcp_dport = p == NULL ? 0 : ntohs(p->th_dport); tcp_seq = p == NULL ? -1 : p->th_seq; tcp_ack = p == NULL ? -1 : p->th_ack; tcp_offset = p == NULL ? -1 : (p->th_off >> 2); tcp_flags = p == NULL ? 0 : p->th_flags; tcp_window = p == NULL ? 0 : p->th_win; tcp_checksum = p == NULL ? 0 : ntohs(p->th_sum); tcp_urgent = p == NULL ? 0 : p->th_urp; tcp_hdr = (struct tcphdr *)p; }; #pragma D binding "1.6.3" translator translator tcplsinfo_t < int s > { tcps_state = s; }; /* Support for TCP debug */ #pragma D binding "1.12.1" TA_INPUT inline int TA_INPUT = 0; #pragma D binding "1.12.1" TA_OUTPUT inline int TA_OUTPUT = 1; #pragma D binding "1.12.1" TA_USER inline int TA_USER = 2; #pragma D binding "1.12.1" TA_RESPOND inline int TA_RESPOND = 3; #pragma D binding "1.12.1" TA_DROP inline int TA_DROP = 4; /* direction strings. */ #pragma D binding "1.12.1" tcpdebug_dir_string inline string tcpdebug_dir_string[uint8_t direction] = direction == TA_INPUT ? "input" : direction == TA_OUTPUT ? "output" : direction == TA_USER ? "user" : direction == TA_RESPOND ? "respond" : direction == TA_OUTPUT ? "drop" : "unknown" ; #pragma D binding "1.12.1" tcpflag_string inline string tcpflag_string[uint8_t flags] = flags & TH_FIN ? "FIN" : flags & TH_SYN ? "SYN" : flags & TH_RST ? "RST" : flags & TH_PUSH ? "PUSH" : flags & TH_ACK ? "ACK" : flags & TH_URG ? "URG" : flags & TH_ECE ? "ECE" : flags & TH_CWR ? "CWR" : "unknown" ; #pragma D binding "1.12.1" PRU_ATTACH inline int PRU_ATTACH = 0; #pragma D binding "1.12.1" PRU_DETACH inline int PRU_DETACH = 1; #pragma D binding "1.12.1" PRU_BIND inline int PRU_BIND = 2; #pragma D binding "1.12.1" PRU_LISTEN inline int PRU_LISTEN = 3; #pragma D binding "1.12.1" PRU_CONNECT inline int PRU_CONNECT = 4; #pragma D binding "1.12.1" PRU_ACCEPT inline int PRU_ACCEPT = 5 ; #pragma D binding "1.12.1" PRU_DISCONNECT inline int PRU_DISCONNECT = 6; #pragma D binding "1.12.1" PRU_SHUTDOWN inline int PRU_SHUTDOWN = 7; #pragma D binding "1.12.1" PRU_RCVD inline int PRU_RCVD = 8; #pragma D binding "1.12.1" PRU_SEND inline int PRU_SEND = 9; #pragma D binding "1.12.1" PRU_ABORT inline int PRU_ABORT = 10; #pragma D binding "1.12.1" PRU_CONTROL inline int PRU_CONTROL = 11; #pragma D binding "1.12.1" PRU_SENSE inline int PRU_SENSE = 12; #pragma D binding "1.12.1" PRU_RCVOOB inline int PRU_RCVOOB = 13; #pragma D binding "1.12.1" PRU_SENDOOB inline int PRU_SENDOOB = 14; #pragma D binding "1.12.1" PRU_SOCKADDR inline int PRU_SOCKADDR = 15; #pragma D binding "1.12.1" PRU_PEERADDR inline int PRU_PEERADDR = 16; #pragma D binding "1.12.1" PRU_CONNECT2 inline int PRU_CONNECT2 = 17; #pragma D binding "1.12.1" PRU_FASTTIMO inline int PRU_FASTTIMO = 18; #pragma D binding "1.12.1" PRU_SLOWTIMO inline int PRU_SLOWTIMO = 19; #pragma D binding "1.12.1" PRU_PROTORCV inline int PRU_PROTORCV = 20; #pragma D binding "1.12.1" PRU_PROTOSEND inline int PRU_PROTOSEND = 21; #pragma D binding "1.12.1" PRU_SEND_EOF inline int PRU_SEND_EOF = 22; #pragma D binding "1.12.1" PRU_SOSETLABEL inline int PRU_SOSETLABEL = 23; #pragma D binding "1.12.1" PRU_CLOSE inline int PRU_CLOSE = 24; #pragma D binding "1.12.1" PRU_FLUSH inline int PRU_FLUSH = 25; #pragma D binding "1.12.1" prureq_string inline string prureq_string[uint8_t req] = req == PRU_ATTACH ? "ATTACH" : req == PRU_DETACH ? "DETACH" : req == PRU_BIND ? "BIND" : req == PRU_LISTEN ? "LISTEN" : req == PRU_CONNECT ? "CONNECT" : req == PRU_ACCEPT ? "ACCEPT" : req == PRU_DISCONNECT ? "DISCONNECT" : req == PRU_SHUTDOWN ? "SHUTDOWN" : req == PRU_RCVD ? "RCVD" : req == PRU_SEND ? "SEND" : req == PRU_ABORT ? "ABORT" : req == PRU_CONTROL ? "CONTROL" : req == PRU_SENSE ? "SENSE" : req == PRU_RCVOOB ? "RCVOOB" : req == PRU_SENDOOB ? "SENDOOB" : req == PRU_SOCKADDR ? "SOCKADDR" : req == PRU_PEERADDR ? "PEERADDR" : req == PRU_CONNECT2 ? "CONNECT2" : req == PRU_FASTTIMO ? "FASTTIMO" : req == PRU_SLOWTIMO ? "SLOWTIMO" : req == PRU_PROTORCV ? "PROTORCV" : req == PRU_PROTOSEND ? "PROTOSEND" : req == PRU_SEND ? "SEND_EOF" : req == PRU_SOSETLABEL ? "SOSETLABEL" : req == PRU_CLOSE ? "CLOSE" : req == PRU_FLUSH ? "FLUSE" : "unknown" ; Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 357857) +++ head/sys/netinet/tcp_input.c (revision 357858) @@ -1,3839 +1,3839 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #include #include #include const int tcprexmtthresh = 3; VNET_DEFINE(int, tcp_log_in_vain) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_newcwv) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_newcwv), 0, "Enable New Congestion Window Validation per RFC7661"); VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc6675_pipe), 0, "Use calculated pipe/in-flight bytes per RFC 6675"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_initcwnd_segments) = 10; SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 2; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); VNET_DEFINE(int, tcp_insecure_syn) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); /* * TCP statistics are stored in an array of counter(9)s, which size matches * size of struct tcpstat. TCP running connection count is a regular array. */ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, "TCP connection counts by TCP state"); static void tcp_vnet_init(const void *unused) { COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_init, NULL); #ifdef VIMAGE static void tcp_vnet_uninit(const void *unused) { COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); } VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_uninit, NULL); #endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_inc(int statnum) { counter_u64_add(VNET(tcpstat)[statnum], 1); } #ifdef TCP_HHOOK /* * Wrapper for the TCP established input helper hook. */ void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, tp->osd); } } #endif /* * CC wrapper hook functions */ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { #ifdef STATS int32_t gput; #endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) || (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) && (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2)))) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)tp->snd_cwnd) - tp->snd_wnd); if (!IN_RECOVERY(tp->t_flags)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN, tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs)); if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { /* * Compute goodput in bits per millisecond. */ gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an API * to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; } #endif /* STATS */ if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, nsegs * V_tcp_abc_l_var * tcp_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); #endif } void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ else tp->snd_cwnd = tcp_compute_initwnd(maxseg); if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { u_int maxseg; INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); #endif switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags)) { TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_RTO: maxseg = tcp_maxseg(tp); tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / maxseg) * maxseg; tp->snd_cwnd = maxseg; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); /* XXXLAS: KASSERT that we're in recovery? */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* XXXLAS: EXIT_RECOVERY ? */ tp->t_bytes_acked = 0; } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) void inline cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->ecnpkt_handler != NULL) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->ccv->flags |= CCF_IPHDR_CE; break; case IPTOS_ECN_ECT0: /* FALLTHROUGH */ case IPTOS_ECN_ECT1: /* FALLTHROUGH */ case IPTOS_ECN_NOTECT: tp->ccv->flags &= ~CCF_IPHDR_CE; break; } if (th->th_flags & TH_CWR) tp->ccv->flags |= CCF_TCPHDR_CWR; else tp->ccv->flags &= ~CCF_TCPHDR_CWR; CC_ALGO(tp)->ecnpkt_handler(tp->ccv); if (tp->ccv->flags & CCF_ACKNOW) { tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); tp->t_flags |= TF_ACKNOW; } } } /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; m = *mp; if (m->m_len < *offp + sizeof(struct tcphdr)) { m = m_pullup(m, *offp + sizeof(struct tcphdr)); if (m == NULL) { *mp = m; TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { ifa_free(&ia6->ia_ifa); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); *mp = NULL; return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); *mp = m; return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ int tcp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; uint8_t ipttl; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = off0 + tlen; ipttl = ip->ip_ttl; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_ttl = ipttl; ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } } #endif /* INET */ /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { if (m->m_len < off0 + off) { m = m_pullup(m, off0 + off); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); findpcb: #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || V_tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK_ASSERT(inp); /* * While waiting for inp lock during the lookup, another thread * can have dropped the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. */ if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); inp = NULL; goto findpcb; } if ((inp->inp_flowtype == M_HASHTYPE_NONE) && (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && ((inp->inp_socket == NULL) || (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { goto dropunlock; } #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { goto dropunlock; } #endif /* INET */ #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6) { if (inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. * * At this point, due to earlier optimism, we may hold only an inpcb * lock, and not the inpcbinfo write lock. If so, we need to try to * acquire it, or if that fails, acquire a reference on the inpcb, * drop all locks, acquire a global write lock, and then re-acquire * the inpcb lock. We may at that point discover that another thread * has tried to free the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. * * XXXRW: It may be time to rethink timewait locking. */ if (inp->inp_flags & INP_TIMEWAIT) { if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN), ("%s: so accepting but tp %p not listening", __func__, tp)); if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) inc.inc_flags |= INC_IPV6MINMTU; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ rstreason = syncache_expand(&inc, &to, th, &so, m); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped * and must not produce any response back * to the sender. */ goto dropunlock; } else if (rstreason == 0) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } tfo_socket_result: if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by * syncache_expand(). */ INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th, m); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { ifa_free(&ia6->ia_ifa); if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (ia6) ifa_free(&ia6->ia_ifa); } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos)) goto tfo_socket_result; /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt go through unhandled. */ goto dropunlock; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { tcp_dooptions(&to, optp, optlen, thflags); if ((to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto dropunlock; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to.to_signature) != 0) goto dropunlock; } #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) INP_WUNLOCK(inp); drop: INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during 1/2 of an sRTT * is at least 3/8 of the current socket buffer size. * 3. receive buffer size has not hit maximal automatic size; * * If all of the criteria are met we increaset the socket buffer * by a 1/2 (bounded by the max). This allows us to keep ahead * of slow-start but also makes it so our peer never gets limited * by our rwnd which we then open up causing a burst. * * This algorithm does two steps per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ int tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int tlen) { int newsize = 0; if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && tp->t_srtt != 0 && tp->rfbuf_ts != 0 && TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) { if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min((so->so_rcv.sb_hiwat + (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max); } TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else { tp->rfbuf_cnt += tlen; /* add up */ } return (newsize); } void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win; uint32_t tiwin; uint16_t nsegs; char *s; struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; int tfo_syn; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); #ifdef TCPPCAP /* Save segment, if requested. */ tcp_pcap_add(th, m, &(tp->t_inpkts)); #endif TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, NULL, true); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } goto drop; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. */ if (tp->t_flags2 & TF2_ECN_PERMIT) { if (thflags & TH_CWR) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; tp->t_flags |= TF_ACKNOW; } switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler(tp, th, iptos); /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_sigopt); /* XXX: should drop? */ } #endif /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; else if (tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if (to.to_flags & TOF_FASTOPEN) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && SEGQ_EMPTY(tp) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery without timestamps. */ if ((to.to_flags & TOF_TS) == 0 && tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { goto drop; } } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial_ack = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial_ack = 1; } /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && (V_tcp_do_ecn == 1)) { tp->t_flags2 |= TF2_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) { if (tp->t_state == TCPS_SYN_RECEIVED && IS_FASTOPEN(tp->t_flags)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } goto step6; } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->snd_wnd = tiwin; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; /* * Account for the ACK of our SYN prior to * regular ACK processing below. */ tp->snd_una++; } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such * connections is not harmless as it would undo the * snd_cwnd reduction that occurs when a TFO SYN|ACK * is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) sack_changed = tcp_sack_doack(tp, &to, th->th_ack); else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { u_int maxseg; maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ /* * Following 2 kinds of acks should not affect * dupack counting: * 1) Old acks * 2) Acks with SACK but without any new SACK * information in them. These could result from * any anomaly in the network like a switch * duplicating packets or a possible DoS attack. */ if (th->th_ack != tp->snd_una || ((tp->t_flags & TF_SACK_PERMIT) && !sack_changed)) break; else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_rfc6675_pipe) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); - tp->sack_newdata = tp->snd_nxt; + tp->snd_recover = tp->snd_nxt; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } break; } else { /* * This ack is advancing the left edge, reset the * counter. */ tp->t_dupacks = 0; /* * If this ack also has new SACK info, increment the * counter as per rfc6675. */ if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed) tp->t_dupacks++; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); KASSERT(acked >= 0, ("%s: acked unexepectedly negative " "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, tp->snd_una, th->th_ack, tp, m)); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, nsegs, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); else tp->snd_wnd = 0; mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); if (tp->snd_wnd >= (uint32_t) acked) tp->snd_wnd -= acked; else tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tcp_twstart(tp); return; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); check_delack: INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. */ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; if (th->th_flags & TH_FIN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. */ void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_SIGNATURE: /* * In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have * to record the fact that the option was observed * here for the syncache code to perform the correct * response. */ if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; case TCPOPT_FAST_OPEN: /* * Cookie length validation is performed by the * server side cookie checking code or the client * side cookie cache update code. */ if (!(flags & TO_SYN)) continue; if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) continue; to->to_flags |= TOF_FASTOPEN; to->to_tfo_len = optlen - 2; to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; INP_WLOCK_ASSERT(tp->t_inpcb); TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt * 1000 / hz)); #endif if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing interface * without forcing IP to fragment. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * NOTE that resulting t_maxseg doesn't include space for TCP options or * IP options, e.g. IPSEC data, since length of this data may vary, and * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS * settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { int mss = 0; uint32_t maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_mssdflt; } #endif /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxseg above. */ offer = tp->t_maxseg; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } #endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ mss = max(mss, 64); tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int mss; uint32_t bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); bzero(&cap, sizeof(cap)); tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; inp = tp->t_inpcb; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ tp->t_maxseg = max(mss, 64); SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; uint32_t thcmtu = 0; uint32_t maxmtu = 0; size_t min_protoh; KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); min_protoh = sizeof(struct tcpiphdr); } #endif #if defined(INET6) || defined(INET) thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ #endif if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; uint32_t ocwnd = tp->snd_cwnd; u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += maxseg; } int tcp_compute_pipe(struct tcpcb *tp) { return (tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes); } uint32_t tcp_compute_initwnd(uint32_t maxseg) { /* * Calculate the Initial Window, also used as Restart Window * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * Support for user specified value for initial flight size. */ if (V_tcp_initcwnd_segments) return min(V_tcp_initcwnd_segments * maxseg, max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) return min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (maxseg > 2190) return (2 * maxseg); else if (maxseg > 1095) return (3 * maxseg); else return (4 * maxseg); } } Index: head/sys/netinet/tcp_log_buf.c =================================================================== --- head/sys/netinet/tcp_log_buf.c (revision 357857) +++ head/sys/netinet/tcp_log_buf.c (revision 357858) @@ -1,2639 +1,2638 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after qmath.h and tree.h */ #include #include #include #include #include #include #include #include #include #include /* Default expiry time */ #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S) /* Max interval at which to run the expiry timer */ #define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S) bool tcp_log_verbose; static uma_zone_t tcp_log_bucket_zone, tcp_log_node_zone, tcp_log_zone; static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT; static uint32_t tcp_log_version = TCP_LOG_BUF_VER; RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket); static struct tcp_log_id_tree tcp_log_id_head; static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head = STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head); static struct mtx tcp_log_expireq_mtx; static struct callout tcp_log_expireq_callout; static u_long tcp_log_auto_ratio = 0; static volatile u_long tcp_log_auto_ratio_cur = 0; static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL; static bool tcp_log_auto_all = false; static uint32_t tcp_disable_all_bb_logs = 0; RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW, 0, "TCP Black Box controls"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose, 0, "Force verbose logging for TCP traces"); SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit, CTLFLAG_RW, &tcp_log_session_limit, 0, "Maximum number of events maintained for each TCP session"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW, &tcp_log_zone, "Maximum number of events maintained for all TCP sessions"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD, &tcp_log_zone, "Current number of events maintained for all TCP sessions"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW, &tcp_log_bucket_zone, "Maximum number of log IDs"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD, &tcp_log_bucket_zone, "Current number of log IDs"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW, &tcp_log_node_zone, "Maximum number of tcpcbs with log IDs"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD, &tcp_log_node_zone, "Current number of tcpcbs with log IDs"); SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version, 0, "Version of log formats exported"); SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW, &tcp_disable_all_bb_logs, TCP_LOG_STATE_HEAD_AUTO, "Disable all BB logging for all connections"); SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW, &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions"); SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW, &tcp_log_auto_mode, TCP_LOG_STATE_HEAD_AUTO, "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_HEAD_AUTO)"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW, &tcp_log_auto_all, false, "Auto-select from all sessions (rather than just those with IDs)"); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_t tcp_log_queued; counter_u64_t tcp_log_que_fail1; counter_u64_t tcp_log_que_fail2; counter_u64_t tcp_log_que_fail3; counter_u64_t tcp_log_que_fail4; counter_u64_t tcp_log_que_fail5; counter_u64_t tcp_log_que_copyout; counter_u64_t tcp_log_que_read; counter_u64_t tcp_log_que_freed; SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD, &tcp_log_queued, "Number of entries queued"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD, &tcp_log_que_fail1, "Number of entries queued but fail 1"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD, &tcp_log_que_fail2, "Number of entries queued but fail 2"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD, &tcp_log_que_fail3, "Number of entries queued but fail 3"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD, &tcp_log_que_fail4, "Number of entries queued but fail 4"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD, &tcp_log_que_fail5, "Number of entries queued but fail 4"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD, &tcp_log_que_copyout, "Number of entries copied out"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD, &tcp_log_que_read, "Number of entries read from the queue"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD, &tcp_log_que_freed, "Number of entries freed after reading"); #endif #ifdef INVARIANTS #define TCPLOG_DEBUG_RINGBUF #endif /* Number of requests to consider a PBCID "active". */ #define ACTIVE_REQUEST_COUNT 10 /* Statistic tracking for "active" PBCIDs. */ static counter_u64_t tcp_log_pcb_ids_cur; static counter_u64_t tcp_log_pcb_ids_tot; SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD, &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD, &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated"); struct tcp_log_mem { STAILQ_ENTRY(tcp_log_mem) tlm_queue; struct tcp_log_buffer tlm_buf; struct tcp_log_verbose tlm_v; #ifdef TCPLOG_DEBUG_RINGBUF volatile int tlm_refcnt; #endif }; /* 60 bytes for the header, + 16 bytes for padding */ static uint8_t zerobuf[76]; /* * Lock order: * 1. TCPID_TREE * 2. TCPID_BUCKET * 3. INP * * Rules: * A. You need a lock on the Tree to add/remove buckets. * B. You need a lock on the bucket to add/remove nodes from the bucket. * C. To change information in a node, you need the INP lock if the tln_closed * field is false. Otherwise, you need the bucket lock. (Note that the * tln_closed field can change at any point, so you need to recheck the * entry after acquiring the INP lock.) * D. To remove a node from the bucket, you must have that entry locked, * according to the criteria of Rule C. Also, the node must not be on * the expiry queue. * E. The exception to C is the expiry queue fields, which are locked by * the TCPLOG_EXPIREQ lock. * * Buckets have a reference count. Each node is a reference. Further, * other callers may add reference counts to keep a bucket from disappearing. * You can add a reference as long as you own a lock sufficient to keep the * bucket from disappearing. For example, a common use is: * a. Have a locked INP, but need to lock the TCPID_BUCKET. * b. Add a refcount on the bucket. (Safe because the INP lock prevents * the TCPID_BUCKET from going away.) * c. Drop the INP lock. * d. Acquire a lock on the TCPID_BUCKET. * e. Acquire a lock on the INP. * f. Drop the refcount on the bucket. * (At this point, the bucket may disappear.) * * Expire queue lock: * You can acquire this with either the bucket or INP lock. Don't reverse it. * When the expire code has committed to freeing a node, it resets the expiry * time to SBT_MAX. That is the signal to everyone else that they should * leave that node alone. */ static struct rwlock tcp_id_tree_lock; #define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock) #define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock) #define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock) #define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock) #define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock) #define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED) #define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED) #define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED) #define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF) #define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED) #define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED) #define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt)) #define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt)) #define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx) #define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx) SLIST_HEAD(tcp_log_id_head, tcp_log_id_node); struct tcp_log_id_bucket { /* * tlb_id must be first. This lets us use strcmp on * (struct tcp_log_id_bucket *) and (char *) interchangeably. */ char tlb_id[TCP_LOG_ID_LEN]; char tlb_tag[TCP_LOG_TAG_LEN]; RB_ENTRY(tcp_log_id_bucket) tlb_rb; struct tcp_log_id_head tlb_head; struct mtx tlb_mtx; volatile u_int tlb_refcnt; volatile u_int tlb_reqcnt; uint32_t tlb_loglimit; uint8_t tlb_logstate; }; struct tcp_log_id_node { SLIST_ENTRY(tcp_log_id_node) tln_list; STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */ sbintime_t tln_expiretime; /* Locked by the expireq lock */ /* * If INP is NULL, that means the connection has closed. We've * saved the connection endpoint information and the log entries * in the tln_ie and tln_entries members. We've also saved a pointer * to the enclosing bucket here. If INP is not NULL, the information is * in the PCB and not here. */ struct inpcb *tln_inp; struct tcpcb *tln_tp; struct tcp_log_id_bucket *tln_bucket; struct in_endpoints tln_ie; struct tcp_log_stailq tln_entries; int tln_count; volatile int tln_closed; uint8_t tln_af; }; enum tree_lock_state { TREE_UNLOCKED = 0, TREE_RLOCKED, TREE_WLOCKED, }; /* Do we want to select this session for auto-logging? */ static __inline bool tcp_log_selectauto(void) { /* * If we are doing auto-capturing, figure out whether we will capture * this session. */ if (tcp_log_auto_ratio && (tcp_disable_all_bb_logs == 0) && (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) % tcp_log_auto_ratio) == 0) return (true); return (false); } static __inline int tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b) { KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL")); KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL")); return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN); } RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) static __inline void tcp_log_id_validate_tree_lock(int tree_locked) { #ifdef INVARIANTS switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WLOCK_ASSERT(); break; case TREE_RLOCKED: TCPID_TREE_RLOCK_ASSERT(); break; case TREE_UNLOCKED: TCPID_TREE_UNLOCK_ASSERT(); break; default: kassert_panic("%s:%d: unknown tree lock state", __func__, __LINE__); } #endif } static __inline void tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb) { TCPID_TREE_WLOCK_ASSERT(); KASSERT(SLIST_EMPTY(&tlb->tlb_head), ("%s: Attempt to remove non-empty bucket", __func__)); if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) { #ifdef INVARIANTS kassert_panic("%s:%d: error removing element from tree", __func__, __LINE__); #endif } TCPID_BUCKET_LOCK_DESTROY(tlb); counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); uma_zfree(tcp_log_bucket_zone, tlb); } /* * Call with a referenced and locked bucket. * Will return true if the bucket was freed; otherwise, false. * tlb: The bucket to unreference. * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * inp: If not NULL and the function needs to drop the inp lock to relock the * tree, it will do so. (The caller must ensure inp will not become invalid, * probably by holding a reference to it.) */ static bool tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked, struct inpcb *inp) { KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); tcp_log_id_validate_tree_lock(*tree_locked); /* * Did we hold the last reference on the tlb? If so, we may need * to free it. (Note that we can realistically only execute the * loop twice: once without a write lock and once with a write * lock.) */ while (TCPID_BUCKET_UNREF(tlb)) { /* * We need a write lock on the tree to free this. * If we can upgrade the tree lock, this is "easy". If we * can't upgrade the tree lock, we need to do this the * "hard" way: unwind all our locks and relock everything. * In the meantime, anything could have changed. We even * need to validate that we still need to free the bucket. */ if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE()) *tree_locked = TREE_WLOCKED; else if (*tree_locked != TREE_WLOCKED) { TCPID_BUCKET_REF(tlb); if (inp != NULL) INP_WUNLOCK(inp); TCPID_BUCKET_UNLOCK(tlb); if (*tree_locked == TREE_RLOCKED) TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); *tree_locked = TREE_WLOCKED; TCPID_BUCKET_LOCK(tlb); if (inp != NULL) INP_WLOCK(inp); continue; } /* * We have an empty bucket and a write lock on the tree. * Remove the empty bucket. */ tcp_log_remove_bucket(tlb); return (true); } return (false); } /* * Call with a locked bucket. This function will release the lock on the * bucket before returning. * * The caller is responsible for freeing the tp->t_lin/tln node! * * Note: one of tp or both tlb and tln must be supplied. * * inp: A pointer to the inp. If the function needs to drop the inp lock to * acquire the tree write lock, it will do so. (The caller must ensure inp * will not become invalid, probably by holding a reference to it.) * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored) * tlb: A pointer to the bucket. (optional; ignored if tp is specified) * tln: A pointer to the node. (optional; ignored if tp is specified) * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * * Will return true if the INP lock was reacquired; otherwise, false. */ static bool tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp, struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln, int *tree_locked) { int orig_tree_locked; KASSERT(tp != NULL || (tlb != NULL && tln != NULL), ("%s: called with tp=%p, tlb=%p, tln=%p", __func__, tp, tlb, tln)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); if (tp != NULL) { tlb = tp->t_lib; tln = tp->t_lin; KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__)); KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__)); } tcp_log_id_validate_tree_lock(*tree_locked); TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Remove the node, clear the log bucket and node from the TCPCB, and * decrement the bucket refcount. In the process, if this is the * last reference, the bucket will be freed. */ SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list); if (tp != NULL) { tp->t_lib = NULL; tp->t_lin = NULL; } orig_tree_locked = *tree_locked; if (!tcp_log_unref_bucket(tlb, tree_locked, inp)) TCPID_BUCKET_UNLOCK(tlb); return (*tree_locked != orig_tree_locked); } #define RECHECK_INP_CLEAN(cleanup) do { \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ rv = ECONNRESET; \ cleanup; \ goto done; \ } \ tp = intotcpcb(inp); \ } while (0) #define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */) static void tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef STATS if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); #endif } static void tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb) { atomic_fetchadd_int(&tlb->tlb_reqcnt, 1); } /* * Associate the specified tag with a particular TCP log ID. * Called with INPCB locked. Returns with it unlocked. * Returns 0 on success or EOPNOTSUPP if the connection has no TCP log ID. */ int tcp_log_set_tag(struct tcpcb *tp, char *tag) { struct tcp_log_id_bucket *tlb; int tree_locked; INP_WLOCK_ASSERT(tp->t_inpcb); tree_locked = TREE_UNLOCKED; tlb = tp->t_lib; if (tlb == NULL) { INP_WUNLOCK(tp->t_inpcb); return (EOPNOTSUPP); } TCPID_BUCKET_REF(tlb); INP_WUNLOCK(tp->t_inpcb); TCPID_BUCKET_LOCK(tlb); strlcpy(tlb->tlb_tag, tag, TCP_LOG_TAG_LEN); if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); return (0); } /* * Set the TCP log ID for a TCPCB. * Called with INPCB locked. Returns with it unlocked. */ int tcp_log_set_id(struct tcpcb *tp, char *id) { struct tcp_log_id_bucket *tlb, *tmp_tlb; struct tcp_log_id_node *tln; struct inpcb *inp; int tree_locked, rv; bool bucket_locked; tlb = NULL; tln = NULL; inp = tp->t_inpcb; tree_locked = TREE_UNLOCKED; bucket_locked = false; restart: INP_WLOCK_ASSERT(inp); /* See if the ID is unchanged. */ if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) || (tp->t_lib == NULL && *id == 0)) { if (tp->t_lib != NULL) { tcp_log_increment_reqcnt(tp->t_lib); if ((tp->t_lib->tlb_logstate) && (tp->t_log_state_set == 0)) { /* Clone in any logging */ tp->t_logstate = tp->t_lib->tlb_logstate; } if ((tp->t_lib->tlb_loglimit) && (tp->t_log_state_set == 0)) { /* We also have a limit set */ tp->t_loglimit = tp->t_lib->tlb_loglimit; } } rv = 0; goto done; } /* * If the TCPCB had a previous ID, we need to extricate it from * the previous list. * * Drop the TCPCB lock and lock the tree and the bucket. * Because this is called in the socket context, we (theoretically) * don't need to worry about the INPCB completely going away * while we are gone. */ if (tp->t_lib != NULL) { tlb = tp->t_lib; TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } TCPID_BUCKET_LOCK(tlb); bucket_locked = true; INP_WLOCK(inp); /* * Unreference the bucket. If our bucket went away, it is no * longer locked or valid. */ if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) { bucket_locked = false; tlb = NULL; } /* Validate the INP. */ RECHECK_INP(); /* * Evaluate whether the bucket changed while we were unlocked. * * Possible scenarios here: * 1. Bucket is unchanged and the same one we started with. * 2. The TCPCB no longer has a bucket and our bucket was * freed. * 3. The TCPCB has a new bucket, whether ours was freed. * 4. The TCPCB no longer has a bucket and our bucket was * not freed. * * In cases 2-4, we will start over. In case 1, we will * proceed here to remove the bucket. */ if (tlb == NULL || tp->t_lib != tlb) { KASSERT(bucket_locked || tlb == NULL, ("%s: bucket_locked (%d) and tlb (%p) are " "inconsistent", __func__, bucket_locked, tlb)); if (bucket_locked) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; } goto restart; } /* * Store the (struct tcp_log_id_node) for reuse. Then, remove * it from the bucket. In the process, we may end up relocking. * If so, we need to validate that the INP is still valid, and * the TCPCB entries match we expect. * * We will clear tlb and change the bucket_locked state just * before calling tcp_log_remove_id_node(), since that function * will unlock the bucket. */ if (tln != NULL) uma_zfree(tcp_log_node_zone, tln); tln = tp->t_lin; tlb = NULL; bucket_locked = false; if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) { RECHECK_INP(); /* * If the TCPCB moved to a new bucket while we had * dropped the lock, restart. */ if (tp->t_lib != NULL || tp->t_lin != NULL) goto restart; } /* * Yay! We successfully removed the TCPCB from its old * bucket. Phew! * * On to bigger and better things... */ } /* At this point, the TCPCB should not be in any bucket. */ KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__)); /* * If the new ID is not empty, we need to now assign this TCPCB to a * new bucket. */ if (*id) { /* Get a new tln, if we don't already have one to reuse. */ if (tln == NULL) { tln = uma_zalloc(tcp_log_node_zone, M_NOWAIT | M_ZERO); if (tln == NULL) { rv = ENOBUFS; goto done; } tln->tln_inp = inp; tln->tln_tp = tp; } /* * Drop the INP lock for a bit. We don't need it, and dropping * it prevents lock order reversals. */ INP_WUNLOCK(inp); /* Make sure we have at least a read lock on the tree. */ tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } refind: /* * Remember that we constructed (struct tcp_log_id_node) so * we can safely cast the id to it for the purposes of finding. */ KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL", __func__, __LINE__)); tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head, (struct tcp_log_id_bucket *) id); /* * If we didn't find a matching bucket, we need to add a new * one. This requires a write lock. But, of course, we will * need to recheck some things when we re-acquire the lock. */ if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) { tree_locked = TREE_WLOCKED; if (!TCPID_TREE_UPGRADE()) { TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); /* * The tree may have changed while we were * unlocked. */ goto refind; } } /* If we need to add a new bucket, do it now. */ if (tmp_tlb == NULL) { /* Allocate new bucket. */ tlb = uma_zalloc(tcp_log_bucket_zone, M_NOWAIT); if (tlb == NULL) { rv = ENOBUFS; goto done_noinp; } counter_u64_add(tcp_log_pcb_ids_cur, 1); counter_u64_add(tcp_log_pcb_ids_tot, 1); if ((tcp_log_auto_all == false) && tcp_log_auto_mode && tcp_log_selectauto()) { /* Save off the log state */ tlb->tlb_logstate = tcp_log_auto_mode; } else tlb->tlb_logstate = TCP_LOG_STATE_OFF; tlb->tlb_loglimit = 0; tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */ /* * Copy the ID to the bucket. * NB: Don't use strlcpy() unless you are sure * we've always validated NULL termination. * * TODO: When I'm done writing this, see if we * we have correctly validated NULL termination and * can use strlcpy(). :-) */ strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1); tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0'; /* * Take the refcount for the first node and go ahead * and lock this. Note that we zero the tlb_mtx * structure, since 0xdeadc0de flips the right bits * for the code to think that this mutex has already * been initialized. :-( */ SLIST_INIT(&tlb->tlb_head); refcount_init(&tlb->tlb_refcnt, 1); tlb->tlb_reqcnt = 1; memset(&tlb->tlb_mtx, 0, sizeof(struct mtx)); TCPID_BUCKET_LOCK_INIT(tlb); TCPID_BUCKET_LOCK(tlb); bucket_locked = true; #define FREE_NEW_TLB() do { \ TCPID_BUCKET_LOCK_DESTROY(tlb); \ uma_zfree(tcp_log_bucket_zone, tlb); \ counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); \ counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1); \ bucket_locked = false; \ tlb = NULL; \ } while (0) /* * Relock the INP and make sure we are still * unassigned. */ INP_WLOCK(inp); RECHECK_INP_CLEAN(FREE_NEW_TLB()); if (tp->t_lib != NULL) { FREE_NEW_TLB(); goto restart; } /* Add the new bucket to the tree. */ tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head, tlb); KASSERT(tmp_tlb == NULL, ("%s: Unexpected conflicting bucket (%p) while " "adding new bucket (%p)", __func__, tmp_tlb, tlb)); /* * If we found a conflicting bucket, free the new * one we made and fall through to use the existing * bucket. */ if (tmp_tlb != NULL) { FREE_NEW_TLB(); INP_WUNLOCK(inp); } #undef FREE_NEW_TLB } /* If we found an existing bucket, use it. */ if (tmp_tlb != NULL) { tlb = tmp_tlb; TCPID_BUCKET_LOCK(tlb); bucket_locked = true; /* * Relock the INP and make sure we are still * unassigned. */ INP_UNLOCK_ASSERT(inp); INP_WLOCK(inp); RECHECK_INP(); if (tp->t_lib != NULL) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; goto restart; } /* Take a reference on the bucket. */ TCPID_BUCKET_REF(tlb); /* Record the request. */ tcp_log_increment_reqcnt(tlb); } tcp_log_grow_tlb(tlb->tlb_id, tp); /* Add the new node to the list. */ SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list); tp->t_lib = tlb; tp->t_lin = tln; if (tp->t_lib->tlb_logstate) { /* Clone in any logging */ tp->t_logstate = tp->t_lib->tlb_logstate; } if (tp->t_lib->tlb_loglimit) { /* The loglimit too */ tp->t_loglimit = tp->t_lib->tlb_loglimit; } tln = NULL; } rv = 0; done: /* Unlock things, as needed, and return. */ INP_WUNLOCK(inp); done_noinp: INP_UNLOCK_ASSERT(inp); if (bucket_locked) { TCPID_BUCKET_LOCK_ASSERT(tlb); TCPID_BUCKET_UNLOCK(tlb); } else if (tlb != NULL) TCPID_BUCKET_UNLOCK_ASSERT(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); if (tln != NULL) uma_zfree(tcp_log_node_zone, tln); return (rv); } /* * Get the TCP log ID for a TCPCB. * Called with INPCB locked. * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long. * Returns number of bytes copied. */ size_t tcp_log_get_id(struct tcpcb *tp, char *buf) { size_t len; INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_lib != NULL) { len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); KASSERT(len < TCP_LOG_ID_LEN, ("%s:%d: tp->t_lib->tlb_id too long (%zu)", __func__, __LINE__, len)); } else { *buf = '\0'; len = 0; } return (len); } /* * Get the tag associated with the TCPCB's log ID. * Called with INPCB locked. Returns with it unlocked. * 'buf' must point to a buffer that is at least TCP_LOG_TAG_LEN bytes long. * Returns number of bytes copied. */ size_t tcp_log_get_tag(struct tcpcb *tp, char *buf) { struct tcp_log_id_bucket *tlb; size_t len; int tree_locked; INP_WLOCK_ASSERT(tp->t_inpcb); tree_locked = TREE_UNLOCKED; tlb = tp->t_lib; if (tlb != NULL) { TCPID_BUCKET_REF(tlb); INP_WUNLOCK(tp->t_inpcb); TCPID_BUCKET_LOCK(tlb); len = strlcpy(buf, tlb->tlb_tag, TCP_LOG_TAG_LEN); KASSERT(len < TCP_LOG_TAG_LEN, ("%s:%d: tp->t_lib->tlb_tag too long (%zu)", __func__, __LINE__, len)); if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); } else { INP_WUNLOCK(tp->t_inpcb); *buf = '\0'; len = 0; } return (len); } /* * Get number of connections with the same log ID. * Log ID is taken from given TCPCB. * Called with INPCB locked. */ u_int tcp_log_get_id_cnt(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Functions/macros to increment/decrement reference count for a log * entry. This should catch when we do a double-free/double-remove or * a double-add. */ static inline void _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1); if (refcnt != 0) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_add(l) \ _tcp_log_entry_refcnt_add((l), __func__, __LINE__) static inline void _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1); if (refcnt != 1) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_rem(l) \ _tcp_log_entry_refcnt_rem((l), __func__, __LINE__) #else /* !TCPLOG_DEBUG_RINGBUF */ #define tcp_log_entry_refcnt_add(l) #define tcp_log_entry_refcnt_rem(l) #endif /* * Cleanup after removing a log entry, but only decrement the count if we * are running INVARIANTS. */ static inline void tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused) { uma_zfree(tcp_log_zone, log_entry); #ifdef INVARIANTS (*count)--; KASSERT(*count >= 0, ("%s: count unexpectedly negative", __func__)); #endif } static void tcp_log_free_entries(struct tcp_log_stailq *head, int *count) { struct tcp_log_mem *log_entry; /* Free the entries. */ while ((log_entry = STAILQ_FIRST(head)) != NULL) { STAILQ_REMOVE_HEAD(head, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_free_log_common(log_entry, count); } } /* Cleanup after removing a log entry. */ static inline void tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry) { uma_zfree(tcp_log_zone, log_entry); tp->t_lognum--; KASSERT(tp->t_lognum >= 0, ("%s: tp->t_lognum unexpectedly negative", __func__)); } /* Remove a log entry from the head of a list. */ static inline void tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry) { KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs), ("%s: attempt to remove non-HEAD log entry", __func__)); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_remove_log_cleanup(tp, log_entry); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Initialize the log entry's reference count, which we want to * survive allocations. */ static int tcp_log_zone_init(void *mem, int size, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; tlm->tlm_refcnt = 0; return (0); } /* * Double check that the refcnt is zero on allocation and return. */ static int tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); return (0); } static void tcp_log_zone_dtor(void *mem, int size, void *args __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); } #endif /* TCPLOG_DEBUG_RINGBUF */ /* Do global initialization. */ void tcp_log_init(void) { tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem), #ifdef TCPLOG_DEBUG_RINGBUF tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init, #else NULL, NULL, NULL, #endif NULL, UMA_ALIGN_PTR, 0); (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT); tcp_log_bucket_zone = uma_zcreate("tcp_log_bucket", sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_log_node_zone = uma_zcreate("tcp_log_node", sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); #ifdef TCPLOG_DEBUG_COUNTERS tcp_log_queued = counter_u64_alloc(M_WAITOK); tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK); tcp_log_que_copyout = counter_u64_alloc(M_WAITOK); tcp_log_que_read = counter_u64_alloc(M_WAITOK); tcp_log_que_freed = counter_u64_alloc(M_WAITOK); #endif tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK); tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK); rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW); mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF); callout_init(&tcp_log_expireq_callout, 1); } /* Do per-TCPCB initialization. */ void tcp_log_tcpcbinit(struct tcpcb *tp) { /* A new TCPCB should start out zero-initialized. */ STAILQ_INIT(&tp->t_logs); /* * If we are doing auto-capturing, figure out whether we will capture * this session. */ tp->t_loglimit = tcp_log_session_limit; if ((tcp_log_auto_all == true) && tcp_log_auto_mode && tcp_log_selectauto()) { tp->t_logstate = tcp_log_auto_mode; tp->t_flags2 |= TF2_LOG_AUTO; } } /* Remove entries */ static void tcp_log_expire(void *unused __unused) { struct tcp_log_id_bucket *tlb; struct tcp_log_id_node *tln; sbintime_t expiry_limit; int tree_locked; TCPLOG_EXPIREQ_LOCK(); if (callout_pending(&tcp_log_expireq_callout)) { /* Callout was reset. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Process entries until we reach one that expires too far in the * future. Look one second in the future. */ expiry_limit = getsbinuptime() + SBT_1S; tree_locked = TREE_UNLOCKED; while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL && tln->tln_expiretime <= expiry_limit) { if (!callout_active(&tcp_log_expireq_callout)) { /* * Callout was stopped. I guess we should * just quit at this point. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Remove the node from the head of the list and unlock * the list. Change the expiry time to SBT_MAX as a signal * to other threads that we now own this. */ STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq); tln->tln_expiretime = SBT_MAX; TCPLOG_EXPIREQ_UNLOCK(); /* * Remove the node from the bucket. */ tlb = tln->tln_bucket; TCPID_BUCKET_LOCK(tlb); if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) { tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_WLOCKED) TCPID_TREE_WUNLOCK(); else TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; } /* Drop the INP reference. */ INP_WLOCK(tln->tln_inp); if (!in_pcbrele_wlocked(tln->tln_inp)) INP_WUNLOCK(tln->tln_inp); /* Free the log records. */ tcp_log_free_entries(&tln->tln_entries, &tln->tln_count); /* Free the node. */ uma_zfree(tcp_log_node_zone, tln); /* Relock the expiry queue. */ TCPLOG_EXPIREQ_LOCK(); } /* * We've expired all the entries we can. Do we need to reschedule * ourselves? */ callout_deactivate(&tcp_log_expireq_callout); if (tln != NULL) { /* * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and * set the next callout to that. (This helps ensure we generally * run the callout no more often than desired.) */ expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL; if (expiry_limit < tln->tln_expiretime) expiry_limit = tln->tln_expiretime; callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } /* We're done. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Move log data from the TCPCB to a new node. This will reset the TCPCB log * entries and log count; however, it will not touch other things from the * TCPCB (e.g. t_lin, t_lib). * * NOTE: Must hold a lock on the INP. */ static void tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln) { INP_WLOCK_ASSERT(tp->t_inpcb); tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie; if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6) tln->tln_af = AF_INET6; else tln->tln_af = AF_INET; tln->tln_entries = tp->t_logs; tln->tln_count = tp->t_lognum; tln->tln_bucket = tp->t_lib; /* Clear information from the PCB. */ STAILQ_INIT(&tp->t_logs); tp->t_lognum = 0; } /* Do per-TCPCB cleanup */ void tcp_log_tcpcbfini(struct tcpcb *tp) { struct tcp_log_id_node *tln, *tln_first; struct tcp_log_mem *log_entry; sbintime_t callouttime; INP_WLOCK_ASSERT(tp->t_inpcb); TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_CONNEND, 0, 0, NULL, false); /* * If we were gathering packets to be automatically dumped, try to do * it now. If this succeeds, the log information in the TCPCB will be * cleared. Otherwise, we'll handle the log information as we do * for other states. */ switch(tp->t_logstate) { case TCP_LOG_STATE_HEAD_AUTO: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", M_NOWAIT, false); break; case TCP_LOG_STATE_TAIL_AUTO: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail", M_NOWAIT, false); break; case TCP_LOG_STATE_CONTINUAL: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false); break; } /* * There are two ways we could keep logs: per-socket or per-ID. If * we are tracking logs with an ID, then the logs survive the * destruction of the TCPCB. * * If the TCPCB is associated with an ID node, move the logs from the * TCPCB to the ID node. In theory, this is safe, for reasons which I * will now explain for my own benefit when I next need to figure out * this code. :-) * * We own the INP lock. Therefore, no one else can change the contents * of this node (Rule C). Further, no one can remove this node from * the bucket while we hold the lock (Rule D). Basically, no one can * mess with this node. That leaves two states in which we could be: * * 1. Another thread is currently waiting to acquire the INP lock, with * plans to do something with this node. When we drop the INP lock, * they will have a chance to do that. They will recheck the * tln_closed field (see note to Rule C) and then acquire the * bucket lock before proceeding further. * * 2. Another thread will try to acquire a lock at some point in the * future. If they try to acquire a lock before we set the * tln_closed field, they will follow state #1. If they try to * acquire a lock after we set the tln_closed field, they will be * able to make changes to the node, at will, following Rule C. * * Therefore, we currently own this node and can make any changes * we want. But, as soon as we set the tln_closed field to true, we * have effectively dropped our lock on the node. (For this reason, we * also need to make sure our writes are ordered correctly. An atomic * operation with "release" semantics should be sufficient.) */ if (tp->t_lin != NULL) { /* Copy the relevant information to the log entry. */ tln = tp->t_lin; KASSERT(tln->tln_inp == tp->t_inpcb, ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)", __func__, tln->tln_inp, tp->t_inpcb)); tcp_log_move_tp_to_node(tp, tln); /* Clear information from the PCB. */ tp->t_lin = NULL; tp->t_lib = NULL; /* * Take a reference on the INP. This ensures that the INP * remains valid while the node is on the expiry queue. This * ensures the INP is valid for other threads that may be * racing to lock this node when we move it to the expire * queue. */ in_pcbref(tp->t_inpcb); /* * Store the entry on the expiry list. The exact behavior * depends on whether we have entries to keep. If so, we * put the entry at the tail of the list and expire in * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put * the entry at the head of the list. (Handling the cleanup * via the expiry timer lets us avoid locking messy-ness here.) */ tln->tln_expiretime = getsbinuptime(); TCPLOG_EXPIREQ_LOCK(); if (tln->tln_count) { tln->tln_expiretime += TCP_LOG_EXPIRE_TIME; if (STAILQ_EMPTY(&tcp_log_expireq_head) && !callout_active(&tcp_log_expireq_callout)) { /* * We are adding the first entry and a callout * is not currently scheduled; therefore, we * need to schedule one. */ callout_reset_sbt(&tcp_log_expireq_callout, tln->tln_expiretime, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln, tln_expireq); } else { callouttime = tln->tln_expiretime + TCP_LOG_EXPIRE_INTVL; tln_first = STAILQ_FIRST(&tcp_log_expireq_head); if ((tln_first == NULL || callouttime < tln_first->tln_expiretime) && (callout_pending(&tcp_log_expireq_callout) || !callout_active(&tcp_log_expireq_callout))) { /* * The list is empty, or we want to run the * expire code before the first entry's timer * fires. Also, we are in a case where a callout * is not actively running. We want to reset * the callout to occur sooner. */ callout_reset_sbt(&tcp_log_expireq_callout, callouttime, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } /* * Insert to the head, or just after the head, as * appropriate. (This might result in small * mis-orderings as a bunch of "expire now" entries * gather at the start of the list, but that should * not produce big problems, since the expire timer * will walk through all of them.) */ if (tln_first == NULL || tln->tln_expiretime < tln_first->tln_expiretime) STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln, tln_expireq); else STAILQ_INSERT_AFTER(&tcp_log_expireq_head, tln_first, tln, tln_expireq); } TCPLOG_EXPIREQ_UNLOCK(); /* * We are done messing with the tln. After this point, we * can't touch it. (Note that the "release" semantics should * be included with the TCPLOG_EXPIREQ_UNLOCK() call above. * Therefore, they should be unnecessary here. However, it * seems like a good idea to include them anyway, since we * really are releasing a lock here.) */ atomic_store_rel_int(&tln->tln_closed, 1); } else { /* Remove log entries. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); } /* * Change the log state to off (just in case anything tries to sneak * in a last-minute log). */ tp->t_logstate = TCP_LOG_STATE_OFF; } static void tcp_log_purge_tp_logbuf(struct tcpcb *tp) { struct tcp_log_mem *log_entry; struct inpcb *inp; inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); if (tp->t_lognum == 0) return; while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); tp->t_logstate = TCP_LOG_STATE_OFF; } /* * This logs an event for a TCP socket. Normally, this is called via * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for * TCP_LOG_EVENT(). */ struct tcp_log_buffer * tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *itv) { struct tcp_log_mem *log_entry; struct tcp_log_buffer *log_buf; int attempt_count = 0; struct tcp_log_verbose *log_verbose; uint32_t logsn; KASSERT((func == NULL && line == 0) || (func != NULL && line > 0), ("%s called with inconsistent func (%p) and line (%d) arguments", __func__, func, line)); INP_WLOCK_ASSERT(tp->t_inpcb); if (tcp_disable_all_bb_logs) { /* * The global shutdown logging * switch has been thrown. Call * the purge function that frees * purges out the logs and * turns off logging. */ tcp_log_purge_tp_logbuf(tp); return (NULL); } KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD || tp->t_logstate == TCP_LOG_STATE_TAIL || tp->t_logstate == TCP_LOG_STATE_CONTINUAL || tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO || tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO, ("%s called with unexpected tp->t_logstate (%d)", __func__, tp->t_logstate)); /* * Get the serial number. We do this early so it will * increment even if we end up skipping the log entry for some * reason. */ logsn = tp->t_logsn++; /* * Can we get a new log entry? If so, increment the lognum counter * here. */ retry: if (tp->t_lognum < tp->t_loglimit) { if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL) tp->t_lognum++; } else log_entry = NULL; /* Do we need to try to reuse? */ if (log_entry == NULL) { /* * Sacrifice auto-logged sessions without a log ID if * tcp_log_auto_all is false. (If they don't have a log * ID by now, it is probable that either they won't get one * or we are resource-constrained.) */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all) { if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) { #ifdef INVARIANTS panic("%s:%d: tcp_log_state_change() failed " "to set tp %p to TCP_LOG_STATE_CLEAR", __func__, __LINE__, tp); #endif tp->t_logstate = TCP_LOG_STATE_OFF; } return (NULL); } /* * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump * the buffers. If successful, deactivate tracing. Otherwise, * leave it active so we will retry. */ if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", M_NOWAIT, false)) { tp->t_logstate = TCP_LOG_STATE_OFF; return(NULL); } else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false)) { if (attempt_count == 0) { attempt_count++; goto retry; } #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail4, 1); #endif return(NULL); } else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) return(NULL); /* If in HEAD state, just deactivate the tracing and return. */ if (tp->t_logstate == TCP_LOG_STATE_HEAD) { tp->t_logstate = TCP_LOG_STATE_OFF; return(NULL); } /* * Get a buffer to reuse. If that fails, just give up. * (We can't log anything without a buffer in which to * put it.) * * Note that we don't change the t_lognum counter * here. Because we are re-using the buffer, the total * number won't change. */ if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL) return(NULL); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); } KASSERT(log_entry != NULL, ("%s: log_entry unexpectedly NULL", __func__)); /* Extract the log buffer and verbose buffer pointers. */ log_buf = &log_entry->tlm_buf; log_verbose = &log_entry->tlm_v; /* Basic entries. */ if (itv == NULL) getmicrouptime(&log_buf->tlb_tv); else memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval)); log_buf->tlb_ticks = ticks; log_buf->tlb_sn = logsn; log_buf->tlb_stackid = tp->t_fb->tfb_id; log_buf->tlb_eventid = eventid; log_buf->tlb_eventflags = 0; log_buf->tlb_errno = errornum; /* Socket buffers */ if (rxbuf != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_RXBUF; log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc; log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc; log_buf->tlb_rxbuf.tls_sb_spare = 0; } if (txbuf != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_TXBUF; log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc; log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc; log_buf->tlb_txbuf.tls_sb_spare = 0; } /* Copy values from tp to the log entry. */ #define COPY_STAT(f) log_buf->tlb_ ## f = tp->f #define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f COPY_STAT_T(state); COPY_STAT_T(starttime); COPY_STAT(iss); COPY_STAT_T(flags); COPY_STAT(snd_una); COPY_STAT(snd_max); COPY_STAT(snd_cwnd); COPY_STAT(snd_nxt); COPY_STAT(snd_recover); COPY_STAT(snd_wnd); COPY_STAT(snd_ssthresh); COPY_STAT_T(srtt); COPY_STAT_T(rttvar); COPY_STAT(rcv_up); COPY_STAT(rcv_adv); COPY_STAT(rcv_nxt); - COPY_STAT(sack_newdata); COPY_STAT(rcv_wnd); COPY_STAT_T(dupacks); COPY_STAT_T(segqlen); COPY_STAT(snd_numholes); COPY_STAT(snd_scale); COPY_STAT(rcv_scale); #undef COPY_STAT #undef COPY_STAT_T log_buf->tlb_flex1 = 0; log_buf->tlb_flex2 = 0; /* Copy stack-specific info. */ if (stackinfo != NULL) { memcpy(&log_buf->tlb_stackinfo, stackinfo, sizeof(log_buf->tlb_stackinfo)); log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO; } /* The packet */ log_buf->tlb_len = len; if (th) { int optlen; log_buf->tlb_eventflags |= TLB_FLAG_HDR; log_buf->tlb_th = *th; if (th_hostorder) tcp_fields_to_net(&log_buf->tlb_th); optlen = (th->th_off << 2) - sizeof (struct tcphdr); if (optlen > 0) memcpy(log_buf->tlb_opts, th + 1, optlen); } /* Verbose information */ if (func != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE; if (output_caller != NULL) strlcpy(log_verbose->tlv_snd_frm, output_caller, TCP_FUNC_LEN); else *log_verbose->tlv_snd_frm = 0; strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN); log_verbose->tlv_trace_line = line; } /* Insert the new log at the tail. */ STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_add(log_entry); return (log_buf); } /* * Change the logging state for a TCPCB. Returns 0 on success or an * error code on failure. */ int tcp_log_state_change(struct tcpcb *tp, int state) { struct tcp_log_mem *log_entry; INP_WLOCK_ASSERT(tp->t_inpcb); switch(state) { case TCP_LOG_STATE_CLEAR: while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); /* Fall through */ case TCP_LOG_STATE_OFF: tp->t_logstate = TCP_LOG_STATE_OFF; break; case TCP_LOG_STATE_TAIL: case TCP_LOG_STATE_HEAD: case TCP_LOG_STATE_CONTINUAL: case TCP_LOG_STATE_HEAD_AUTO: case TCP_LOG_STATE_TAIL_AUTO: tp->t_logstate = state; break; default: return (EINVAL); } if (tcp_disable_all_bb_logs) { /* We are prohibited from doing any logs */ tp->t_logstate = TCP_LOG_STATE_OFF; } tp->t_flags2 &= ~(TF2_LOG_AUTO); return (0); } /* If tcp_drain() is called, flush half the log entries. */ void tcp_log_drain(struct tcpcb *tp) { struct tcp_log_mem *log_entry, *next; int target, skip; INP_WLOCK_ASSERT(tp->t_inpcb); if ((target = tp->t_lognum / 2) == 0) return; /* * If we are logging the "head" packets, we want to discard * from the tail of the queue. Otherwise, we want to discard * from the head. */ if (tp->t_logstate == TCP_LOG_STATE_HEAD || tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) { skip = tp->t_lognum - target; STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue) if (!--skip) break; KASSERT(log_entry != NULL, ("%s: skipped through all entries!", __func__)); if (log_entry == NULL) return; while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) { STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_rem(next); tcp_log_remove_log_cleanup(tp, next); #ifdef INVARIANTS target--; #endif } KASSERT(target == 0, ("%s: After removing from tail, target was %d", __func__, target)); } else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false); } else { while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL && target--) tcp_log_remove_log_head(tp, log_entry); KASSERT(target <= 0, ("%s: After removing from head, target was %d", __func__, target)); KASSERT(tp->t_lognum > 0, ("%s: After removing from head, tp->t_lognum was %d", __func__, target)); KASSERT(log_entry != NULL, ("%s: After removing from head, the tailq was empty", __func__)); } } static inline int tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len) { if (sopt->sopt_td != NULL) return (copyout(src, dst, len)); bcopy(src, dst, len); return (0); } static int tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp, struct tcp_log_buffer **end, int count) { struct tcp_log_buffer *out_entry; struct tcp_log_mem *log_entry; size_t entrysize; int error; #ifdef INVARIANTS int orig_count = count; #endif /* Copy the data out. */ error = 0; out_entry = (struct tcp_log_buffer *) sopt->sopt_val; STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) { count--; KASSERT(count >= 0, ("%s:%d: Exceeded expected count (%d) processing list %p", __func__, __LINE__, orig_count, log_tailqp)); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_copyout, 1); #endif /* * Skip copying out the header if it isn't present. * Instead, copy out zeros (to ensure we don't leak info). * TODO: Make sure we truly do zero everything we don't * explicitly set. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR) entrysize = sizeof(struct tcp_log_buffer); else entrysize = offsetof(struct tcp_log_buffer, tlb_th); error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry, entrysize); if (error) break; if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) { error = tcp_log_copyout(sopt, zerobuf, ((uint8_t *)out_entry) + entrysize, sizeof(struct tcp_log_buffer) - entrysize); } /* * Copy out the verbose bit, if needed. Either way, * increment the output pointer the correct amount. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) { error = tcp_log_copyout(sopt, &log_entry->tlm_v, out_entry->tlb_verbose, sizeof(struct tcp_log_verbose)); if (error) break; out_entry = (struct tcp_log_buffer *) (((uint8_t *) (out_entry + 1)) + sizeof(struct tcp_log_verbose)); } else out_entry++; } *end = out_entry; KASSERT(error || count == 0, ("%s:%d: Less than expected count (%d) processing list %p" " (%d remain)", __func__, __LINE__, orig_count, log_tailqp, count)); return (error); } /* * Copy out the buffer. Note that we do incremental copying, so * sooptcopyout() won't work. However, the goal is to produce the same * end result as if we copied in the entire user buffer, updated it, * and then used sooptcopyout() to copy it out. * * NOTE: This should be called with a write lock on the PCB; however, * the function will drop it after it extracts the data from the TCPCB. */ int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp) { struct tcp_log_stailq log_tailq; struct tcp_log_mem *log_entry, *log_next; struct tcp_log_buffer *out_entry; struct inpcb *inp; size_t outsize, entrysize; int error, outnum; INP_WLOCK_ASSERT(tp->t_inpcb); inp = tp->t_inpcb; /* * Determine which log entries will fit in the buffer. As an * optimization, skip this if all the entries will clearly fit * in the buffer. (However, get an exact size if we are using * INVARIANTS.) */ #ifndef INVARIANTS if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)) >= tp->t_lognum) { log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue); log_next = NULL; outsize = 0; outnum = tp->t_lognum; } else { #endif outsize = outnum = 0; log_entry = NULL; STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) { entrysize = sizeof(struct tcp_log_buffer); if (log_next->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) entrysize += sizeof(struct tcp_log_verbose); if ((sopt->sopt_valsize - outsize) < entrysize) break; outsize += entrysize; outnum++; log_entry = log_next; } KASSERT(outsize <= sopt->sopt_valsize, ("%s: calculated output size (%zu) greater than available" "space (%zu)", __func__, outsize, sopt->sopt_valsize)); #ifndef INVARIANTS } #endif /* * Copy traditional sooptcopyout() behavior: if sopt->sopt_val * is NULL, silently skip the copy. However, in this case, we * will leave the list alone and return. Functionally, this * gives userspace a way to poll for an approximate buffer * size they will need to get the log entries. */ if (sopt->sopt_val == NULL) { INP_WUNLOCK(inp); if (outsize == 0) { outsize = outnum * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); } if (sopt->sopt_valsize > outsize) sopt->sopt_valsize = outsize; return (0); } /* * Break apart the list. We'll save the ones we want to copy * out locally and remove them from the TCPCB list. We can * then drop the INPCB lock while we do the copyout. * * There are roughly three cases: * 1. There was nothing to copy out. That's easy: drop the * lock and return. * 2. We are copying out the entire list. Again, that's easy: * move the whole list. * 3. We are copying out a partial list. That's harder. We * need to update the list book-keeping entries. */ if (log_entry != NULL && log_next == NULL) { /* Move entire list. */ KASSERT(outnum == tp->t_lognum, ("%s:%d: outnum (%d) should match tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); log_tailq = tp->t_logs; tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); } else if (log_entry != NULL) { /* Move partial list. */ KASSERT(outnum < tp->t_lognum, ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs); STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue); KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL, ("%s:%d: tp->t_logs is unexpectedly shorter than expected" "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)", __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum)); STAILQ_NEXT(log_entry, tlm_queue) = NULL; log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue); tp->t_lognum -= outnum; } else STAILQ_INIT(&log_tailq); /* Drop the PCB lock. */ INP_WUNLOCK(inp); /* Copy the data out. */ error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum); if (error) { /* Restore list */ INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) { tp = intotcpcb(inp); /* Merge the two lists. */ STAILQ_CONCAT(&log_tailq, &tp->t_logs); tp->t_logs = log_tailq; tp->t_lognum += outnum; } INP_WUNLOCK(inp); } else { /* Sanity check entries */ KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) == outsize, ("%s: Actual output size (%zu) != " "calculated output size (%zu)", __func__, (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val), outsize)); /* Free the entries we just copied out. */ STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) { tcp_log_entry_refcnt_rem(log_entry); uma_zfree(tcp_log_zone, log_entry); } } sopt->sopt_valsize = (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val); return (error); } static void tcp_log_free_queue(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; KASSERT(param != NULL, ("%s: called with NULL param", __func__)); if (param == NULL) return; entry = (struct tcp_log_dev_log_queue *)param; /* Free the entries. */ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); /* Free the buffer, if it is allocated. */ if (entry->tldl_common.tldq_buf != NULL) free(entry->tldl_common.tldq_buf, M_TCPLOGDEV); /* Free the queue entry. */ free(entry, M_TCPLOGDEV); } static struct tcp_log_common_header * tcp_log_expandlogbuf(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; struct tcp_log_header *hdr; uint8_t *end; struct sockopt sopt; int error; entry = (struct tcp_log_dev_log_queue *)param; /* Take a worst-case guess at space needs. */ sopt.sopt_valsize = sizeof(struct tcp_log_header) + entry->tldl_count * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT); if (hdr == NULL) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail5, entry->tldl_count); #endif return (NULL); } sopt.sopt_val = hdr + 1; sopt.sopt_valsize -= sizeof(struct tcp_log_header); sopt.sopt_td = NULL; error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries, (struct tcp_log_buffer **)&end, entry->tldl_count); if (error) { free(hdr, M_TCPLOGDEV); return (NULL); } /* Free the entries. */ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); entry->tldl_count = 0; memset(hdr, 0, sizeof(struct tcp_log_header)); hdr->tlh_version = TCP_LOG_BUF_VER; hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR; hdr->tlh_length = end - (uint8_t *)hdr; hdr->tlh_ie = entry->tldl_ie; hdr->tlh_af = entry->tldl_af; getboottime(&hdr->tlh_offset); strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN); strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN); strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN); return ((struct tcp_log_common_header *)hdr); } /* * Queue the tcpcb's log buffer for transmission via the log buffer facility. * * NOTE: This should be called with a write lock on the PCB. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the INP lock if it needs to do so. * * If force is false, this will only dump auto-logged sessions if * tcp_log_auto_all is true or if there is a log ID defined for the session. */ int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force) { struct tcp_log_dev_log_queue *entry; struct inpcb *inp; #ifdef TCPLOG_DEBUG_COUNTERS int num_entries; #endif inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); /* If there are no log entries, there is nothing to do. */ if (tp->t_lognum == 0) return (0); /* Check for a log ID. */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all && !force) { struct tcp_log_mem *log_entry; /* * We needed a log ID and none was found. Free the log entries * and return success. Also, cancel further logging. If the * session doesn't have a log ID by now, we'll assume it isn't * going to get one. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); tp->t_logstate = TCP_LOG_STATE_OFF; return (0); } /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail3, 1); #endif return (ENOBUFS); } if (entry == NULL) { INP_WUNLOCK(inp); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); INP_WLOCK(inp); /* * Note that this check is slightly overly-restrictive in * that the TCB can survive either of these events. * However, there is currently not a good way to ensure * that is the case. So, if we hit this M_WAIT path, we * may end up dropping some entries. That seems like a * small price to pay for safety. */ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { free(entry, M_TCPLOGDEV); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail2, 1); #endif return (ECONNRESET); } tp = intotcpcb(inp); if (tp->t_lognum == 0) { free(entry, M_TCPLOGDEV); return (0); } } /* Fill in the unique parts of the queue entry. */ if (tp->t_lib != NULL) { strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN); } else { strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN); strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN); } if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); entry->tldl_ie = inp->inp_inc.inc_ie; if (inp->inp_inc.inc_flags & INC_ISIPV6) entry->tldl_af = AF_INET6; else entry->tldl_af = AF_INET; entry->tldl_entries = tp->t_logs; entry->tldl_count = tp->t_lognum; /* Fill in the common parts of the queue entry. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Clear the log data from the TCPCB. */ #ifdef TCPLOG_DEBUG_COUNTERS num_entries = tp->t_lognum; #endif tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) { tcp_log_free_queue((struct tcp_log_dev_queue *)entry); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail1, num_entries); } else { counter_u64_add(tcp_log_queued, num_entries); #endif } return (0); } /* * Queue the log_id_node's log buffers for transmission via the log buffer * facility. * * NOTE: This should be called with the bucket locked and referenced. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the bucket lock if it needs to do so. (The caller must * ensure that the tln is no longer on any lists so no one else will mess * with this while the lock is dropped!) */ static int tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how) { struct tcp_log_dev_log_queue *entry; struct tcp_log_id_bucket *tlb; tlb = tln->tln_bucket; TCPID_BUCKET_LOCK_ASSERT(tlb); KASSERT(tlb->tlb_refcnt > 0, ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)", __func__, __LINE__, tln, tlb)); KASSERT(tln->tln_closed, ("%s:%d: Called for node with tln_closed==false (tln=%p)", __func__, __LINE__, tln)); /* If there are no log entries, there is nothing to do. */ if (tln->tln_count == 0) return (0); /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) return (ENOBUFS); if (entry == NULL) { TCPID_BUCKET_UNLOCK(tlb); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); TCPID_BUCKET_LOCK(tlb); } /* Fill in the common parts of the queue entry.. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Fill in the unique parts of the queue entry. */ strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN); strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN); if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); entry->tldl_ie = tln->tln_ie; entry->tldl_entries = tln->tln_entries; entry->tldl_count = tln->tln_count; entry->tldl_af = tln->tln_af; /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) tcp_log_free_queue((struct tcp_log_dev_queue *)entry); return (0); } /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked bucket; however, the function * will drop the lock. */ #define LOCAL_SAVE 10 static void tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason) { struct tcp_log_id_node local_entries[LOCAL_SAVE]; struct inpcb *inp; struct tcpcb *tp; struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln; int i, num_local_entries, tree_locked; bool expireq_locked; TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Take a reference on the bucket to keep it from disappearing until * we are done. */ TCPID_BUCKET_REF(tlb); /* * We'll try to create these without dropping locks. However, we * might very well need to drop locks to get memory. If that's the * case, we'll save up to 10 on the stack, and sacrifice the rest. * (Otherwise, we need to worry about finding our place again in a * potentially changed list. It just doesn't seem worth the trouble * to do that. */ expireq_locked = false; num_local_entries = 0; prev_tln = NULL; tree_locked = TREE_UNLOCKED; SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) { /* * If this isn't associated with a TCPCB, we can pull it off * the list now. We need to be careful that the expire timer * hasn't already taken ownership (tln_expiretime == SBT_MAX). * If so, we let the expire timer code free the data. */ if (cur_tln->tln_closed) { no_inp: /* * Get the expireq lock so we can get a consistent * read of tln_expiretime and so we can remove this * from the expireq. */ if (!expireq_locked) { TCPLOG_EXPIREQ_LOCK(); expireq_locked = true; } /* * We ignore entries with tln_expiretime == SBT_MAX. * The expire timer code already owns those. */ KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0, ("%s:%d: node on the expire queue without positive " "expire time", __func__, __LINE__)); if (cur_tln->tln_expiretime == SBT_MAX) { prev_tln = cur_tln; continue; } /* Remove the entry from the expireq. */ STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln, tcp_log_id_node, tln_expireq); /* Remove the entry from the bucket. */ if (prev_tln != NULL) SLIST_REMOVE_AFTER(prev_tln, tln_list); else SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list); /* * Drop the INP and bucket reference counts. Due to * lock-ordering rules, we need to drop the expire * queue lock. */ TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; /* Drop the INP reference. */ INP_WLOCK(cur_tln->tln_inp); if (!in_pcbrele_wlocked(cur_tln->tln_inp)) INP_WUNLOCK(cur_tln->tln_inp); if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { #ifdef INVARIANTS panic("%s: Bucket refcount unexpectedly 0.", __func__); #endif /* * Recover as best we can: free the entry we * own. */ tcp_log_free_entries(&cur_tln->tln_entries, &cur_tln->tln_count); uma_zfree(tcp_log_node_zone, cur_tln); goto done; } if (tcp_log_dump_node_logbuf(cur_tln, reason, M_NOWAIT)) { /* * If we have sapce, save the entries locally. * Otherwise, free them. */ if (num_local_entries < LOCAL_SAVE) { local_entries[num_local_entries] = *cur_tln; num_local_entries++; } else { tcp_log_free_entries( &cur_tln->tln_entries, &cur_tln->tln_count); } } /* No matter what, we are done with the node now. */ uma_zfree(tcp_log_node_zone, cur_tln); /* * Because we removed this entry from the list, prev_tln * (which tracks the previous entry still on the tlb * list) remains unchanged. */ continue; } /* * If we get to this point, the session data is still held in * the TCPCB. So, we need to pull the data out of that. * * We will need to drop the expireq lock so we can lock the INP. * We can then try to extract the data the "easy" way. If that * fails, we'll save the log entries for later. */ if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* Lock the INP and then re-check the state. */ inp = cur_tln->tln_inp; INP_WLOCK(inp); /* * If we caught this while it was transitioning, the data * might have moved from the TCPCB to the tln (signified by * setting tln_closed to true. If so, treat this like an * inactive connection. */ if (cur_tln->tln_closed) { /* * It looks like we may have caught this connection * while it was transitioning from active to inactive. * Treat this like an inactive connection. */ INP_WUNLOCK(inp); goto no_inp; } /* * Try to dump the data from the tp without dropping the lock. * If this fails, try to save off the data locally. */ tp = cur_tln->tln_tp; if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) && num_local_entries < LOCAL_SAVE) { tcp_log_move_tp_to_node(tp, &local_entries[num_local_entries]); local_entries[num_local_entries].tln_closed = 1; KASSERT(local_entries[num_local_entries].tln_bucket == tlb, ("%s: %d: bucket mismatch for node %p", __func__, __LINE__, cur_tln)); num_local_entries++; } INP_WUNLOCK(inp); /* * We are goint to leave the current tln on the list. It will * become the previous tln. */ prev_tln = cur_tln; } /* Drop our locks, if any. */ KASSERT(tree_locked == TREE_UNLOCKED, ("%s: %d: tree unexpectedly locked", __func__, __LINE__)); switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); tree_locked = TREE_UNLOCKED; break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; break; } if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* * Try again for any saved entries. tcp_log_dump_node_logbuf() is * guaranteed to free the log entries within the node. And, since * the node itself is on our stack, we don't need to free it. */ for (i = 0; i < num_local_entries; i++) tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK); /* Drop our reference. */ if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); done: /* Drop our locks, if any. */ switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } if (expireq_locked) TCPLOG_EXPIREQ_UNLOCK(); } #undef LOCAL_SAVE /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked INP; however, the function * will drop the lock. */ void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason) { struct tcp_log_id_bucket *tlb; int tree_locked; /* Figure out our bucket and lock it. */ INP_WLOCK_ASSERT(tp->t_inpcb); tlb = tp->t_lib; if (tlb == NULL) { /* * No bucket; treat this like a request to dump a single * session's traces. */ (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true); INP_WUNLOCK(tp->t_inpcb); return; } TCPID_BUCKET_REF(tlb); INP_WUNLOCK(tp->t_inpcb); TCPID_BUCKET_LOCK(tlb); /* If we are the last reference, we have nothing more to do here. */ tree_locked = TREE_UNLOCKED; if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } return; } /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */ tcp_log_dumpbucketlogs(tlb, reason); } /* * Mark the end of a flow with the current stack. A stack can add * stack-specific info to this trace event by overriding this * function (see bbr_log_flowend() for example). */ void tcp_log_flowend(struct tcpcb *tp) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { struct socket *so = tp->t_inpcb->inp_socket; TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FLOWEND, 0, 0, NULL, false); } } Index: head/sys/netinet/tcp_log_buf.h =================================================================== --- head/sys/netinet/tcp_log_buf.h (revision 357857) +++ head/sys/netinet/tcp_log_buf.h (revision 357858) @@ -1,381 +1,380 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_log_buf_h__ #define __tcp_log_buf_h__ #define TCP_LOG_REASON_LEN 32 #define TCP_LOG_TAG_LEN 32 -#define TCP_LOG_BUF_VER (7) +#define TCP_LOG_BUF_VER (8) /* * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires * 8-byte alignment to work properly on all platforms. Therefore, we will * enforce 8-byte alignment for all the structures that may appear by * themselves (instead of being embedded in another structure) in a data * stream. */ #define ALIGN_TCP_LOG __aligned(8) /* Information about the socketbuffer state. */ struct tcp_log_sockbuf { uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */ uint32_t tls_sb_spare; /* spare */ }; /* Optional, verbose information that may be appended to an event log. */ struct tcp_log_verbose { #define TCP_FUNC_LEN 32 char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that generated trace */ uint32_t tlv_trace_line; /* Line number that generated trace */ uint8_t _pad[4]; } ALIGN_TCP_LOG; /* Internal RACK state variables. */ struct tcp_log_rack { uint32_t tlr_rack_rtt; /* rc_rack_rtt */ uint8_t tlr_state; /* Internal RACK state */ uint8_t _pad[3]; /* Padding */ }; struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; uint64_t rttProp; uint64_t bw_inuse; uint32_t inflight; uint32_t applimited; uint32_t delivered; uint32_t timeStamp; uint32_t epoch; uint32_t lt_epoch; uint32_t pkts_out; uint32_t flex1; uint32_t flex2; uint32_t flex3; uint32_t flex4; uint32_t flex5; uint32_t flex6; uint32_t lost; uint16_t pacing_gain; uint16_t cwnd_gain; uint16_t flex7; uint8_t bbr_state; uint8_t bbr_substate; uint8_t inhpts; uint8_t ininput; uint8_t use_lt_bw; uint8_t flex8; uint32_t pkt_epoch; }; /* Per-stack stack-specific info. */ union tcp_log_stackspecific { struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; }; struct tcp_log_buffer { /* Event basics */ struct timeval tlb_tv; /* Timestamp of trace */ uint32_t tlb_ticks; /* Timestamp of trace */ uint32_t tlb_sn; /* Serial number */ uint8_t tlb_stackid; /* Stack ID */ uint8_t tlb_eventid; /* Event ID */ uint16_t tlb_eventflags; /* Flags for the record */ #define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */ #define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */ #define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */ #define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */ #define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */ int tlb_errno; /* Event error (if any) */ /* Internal session state */ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */ int tlb_state; /* TCPCB t_state */ uint32_t tlb_starttime; /* TCPCB t_starttime */ uint32_t tlb_iss; /* TCPCB iss */ uint32_t tlb_flags; /* TCPCB flags */ uint32_t tlb_snd_una; /* TCPCB snd_una */ uint32_t tlb_snd_max; /* TCPCB snd_max */ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */ uint32_t tlb_snd_recover;/* TCPCB snd_recover */ uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */ uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */ uint32_t tlb_srtt; /* TCPCB t_srtt */ uint32_t tlb_rttvar; /* TCPCB t_rttvar */ uint32_t tlb_rcv_up; /* TCPCB rcv_up */ uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */ uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */ - tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */ uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */ uint32_t tlb_dupacks; /* TCPCB t_dupacks */ int tlb_segqlen; /* TCPCB segqlen */ int tlb_snd_numholes; /* TCPCB snd_numholes */ uint32_t tlb_flex1; /* Event specific information */ uint32_t tlb_flex2; /* Event specific information */ uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */ tlb_rcv_scale:4; /* TCPCB rcv_scale */ uint8_t _pad[3]; /* Padding */ /* Per-stack info */ union tcp_log_stackspecific tlb_stackinfo; #define tlb_rack tlb_stackinfo.u_rack /* The packet */ uint32_t tlb_len; /* The packet's data length */ struct tcphdr tlb_th; /* The TCP header */ uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */ /* Verbose information (optional) */ struct tcp_log_verbose tlb_verbose[0]; } ALIGN_TCP_LOG; enum tcp_log_events { TCP_LOG_IN = 1, /* Incoming packet 1 */ TCP_LOG_OUT, /* Transmit (without other event) 2 */ TCP_LOG_RTO, /* Retransmit timeout 3 */ TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER, /* Detected reorder 7 */ TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */ BBR_LOG_TIMERSTAR, /* Start a timer 13 */ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */ BBR_LOG_ENTREC, /* Entered recovery 15 */ BBR_LOG_EXITREC, /* Exited recovery 16 */ BBR_LOG_CWND, /* Cwnd change 17 */ BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */ BBR_LOG_MSGSIZE, /* We received a EMSGSIZE error 19 */ BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */ BBR_LOG_JUSTRET, /* We just returned out of output 21 */ BBR_LOG_STATE, /* A BBR state change occured 22 */ BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occured 23 */ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ TCP_LOG_CONNEND, /* End of connection 54 */ TCP_LOG_LRO, /* LRO entry 55 */ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */ TCP_SAD_DETECTION, /* Sack Attack Detection 57 */ TCP_LOG_END /* End (keep at end) 58 */ }; enum tcp_log_states { TCP_LOG_STATE_CLEAR = -1, /* Deactivate and clear tracing */ TCP_LOG_STATE_OFF = 0, /* Pause */ TCP_LOG_STATE_TAIL=1, /* Keep the trailing events */ TCP_LOG_STATE_HEAD=2, /* Keep the leading events */ TCP_LOG_STATE_HEAD_AUTO=3, /* Keep the leading events, and automatically dump them to the device */ TCP_LOG_STATE_CONTINUAL=4, /* Continually dump the data when full */ TCP_LOG_STATE_TAIL_AUTO=5, /* Keep the trailing events, and automatically dump them when the session ends */ }; /* Use this if we don't know whether the operation succeeded. */ #define ERRNO_UNK (-1) /* * If the user included dev/tcp_log/tcp_log_dev.h, then include our private * headers. Otherwise, there is no reason to pollute all the files with an * additional include. * * This structure is aligned to an 8-byte boundary to match the alignment * requirements of (struct tcp_log_buffer). */ #ifdef __tcp_log_dev_h__ struct tcp_log_header { struct tcp_log_common_header tlh_common; #define tlh_version tlh_common.tlch_version #define tlh_type tlh_common.tlch_type #define tlh_length tlh_common.tlch_length struct in_endpoints tlh_ie; struct timeval tlh_offset; /* Uptime -> UTC offset */ char tlh_id[TCP_LOG_ID_LEN]; char tlh_reason[TCP_LOG_REASON_LEN]; char tlh_tag[TCP_LOG_TAG_LEN]; uint8_t tlh_af; uint8_t _pad[7]; } ALIGN_TCP_LOG; #ifdef _KERNEL struct tcp_log_dev_log_queue { struct tcp_log_dev_queue tldl_common; char tldl_id[TCP_LOG_ID_LEN]; char tldl_reason[TCP_LOG_REASON_LEN]; char tldl_tag[TCP_LOG_TAG_LEN]; struct in_endpoints tldl_ie; struct tcp_log_stailq tldl_entries; int tldl_count; uint8_t tldl_af; }; #endif /* _KERNEL */ #endif /* __tcp_log_dev_h__ */ #ifdef _KERNEL #define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000 #define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always * tries to record verbose information. */ #define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ tp->t_output_caller, __func__, __LINE__, tv);\ } while (0) /* * TCP_LOG_EVENT: This is a macro so we can capture function/line * information when needed. * * Prototype: * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, * struct sockbuf *txbuf, uint8_t eventid, int errornum, * union tcp_log_stackspecific *stackinfo) * * tp is mandatory and must be write locked. * th is optional; if present, it will appear in the record. * rxbuf and txbuf are optional; if present, they will appear in the record. * eventid is mandatory. * errornum is mandatory (it indicates the success or failure of the * operation associated with the event). * len indicates the length of the packet. If no packet, use 0. * stackinfo is optional; if present, it will appear in the record. */ #ifdef TCP_LOG_FORCEVERBOSE #define TCP_LOG_EVENT TCP_LOG_EVENT_VERBOSE #else #define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \ do { \ if (tcp_log_verbose) \ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \ eventid, errornum, len, stackinfo, \ th_hostorder, NULL); \ else if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX extern bool tcp_log_verbose; void tcp_log_drain(struct tcpcb *tp); int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force); void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason); struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv); size_t tcp_log_get_id(struct tcpcb *tp, char *buf); size_t tcp_log_get_tag(struct tcpcb *tp, char *buf); u_int tcp_log_get_id_cnt(struct tcpcb *tp); int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp); void tcp_log_init(void); int tcp_log_set_id(struct tcpcb *tp, char *id); int tcp_log_set_tag(struct tcpcb *tp, char *tag); int tcp_log_state_change(struct tcpcb *tp, int state); void tcp_log_tcpcbinit(struct tcpcb *tp); void tcp_log_tcpcbfini(struct tcpcb *tp); void tcp_log_flowend(struct tcpcb *tp); #else /* !TCP_BLACKBOX */ #define tcp_log_verbose (false) static inline struct tcp_log_buffer * tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv) { return (NULL); } #endif /* TCP_BLACKBOX */ #endif /* _KERNEL */ #endif /* __tcp_log_buf_h__ */ Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 357857) +++ head/sys/netinet/tcp_output.c (revision 357858) @@ -1,2104 +1,2104 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_tcpdebug.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0; #define V_tcp_sendbuf_auto_lowat VNET(tcp_sendbuf_auto_lowat) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendbuf_auto_lowat), 0, "Modify threshold for auto send buffer growth to account for SO_SNDLOWAT"); /* * Make sure that either retransmit or persist timer is set for SYN, FIN and * non-ACK. */ #define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ tcp_timer_active((tp), TT_REXMT) || \ tcp_timer_active((tp), TT_PERSIST), \ ("neither rexmt nor persist timer is set")) static void inline cc_after_idle(struct tcpcb *tp); #ifdef TCP_HHOOK /* * Wrapper for the TCP established output helper hook. */ void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); } } #endif /* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; int32_t len; uint32_t recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ u_int if_hw_tsomaxsegcount = 0; u_int if_hw_tsomaxsegsize = 0; struct mbuf *m; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int idle, sendalot, curticks; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif #ifdef KERN_TLS const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; #else const bool hw_tls = false; #endif NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * For TFO connections in SYN_SENT or SYN_RECEIVED, * only allow the initial SYN or SYN|ACK and those sent * by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { uint32_t cwin; cwin = imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((int32_t)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((int32_t)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - off); else { int32_t cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - - (tp->snd_nxt - tp->sack_newdata) - + (tp->snd_nxt - tp->snd_recover) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = imin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) len = 0; if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * We also do a general check here to ensure that * we will set the persist timer when we have data * to send, but a 0-byte window. This makes sure * the persist timer is set even if the packet * hits one of the "goto send" lines below. */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif /* IPSEC */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0 && !(flags & TH_SYN)) tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. Skip this if the connection is in T/TCP * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ int32_t adv; int oldwin; adv = recwin; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as or less * than the old size when it is scaled, then don't force * a window update. */ if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg || adv >= TCP_MAXWIN << tp->rcv_scale)) goto send; if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ dont_sendalot = 1; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { curticks = tcp_ts_getticks(); to.to_tsval = curticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; if (tp->t_rxtshift == 1) tp->t_badrxtwin = curticks; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ /* * Check that TCP_MD5SIG is enabled in tcpcb to * account the size needed to set this TCP option. */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = (tp->t_maxseg - optlen); if (((uint32_t)off + (uint32_t)len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put * the IP header chain and the TCP header in * one packet as required by RFC 7112, don't * send it. Also ensure that at least one * byte of the payload can be put into the * TCP segment. */ SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; sack_rxmit = 0; goto out; } len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; if (dont_sendalot) sendalot = 0; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; struct sockbuf *msb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); #ifdef STATS if (SEQ_LT(tp->snd_nxt, tp->snd_max)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); else stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif /* STATS */ } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif /* STATS */ } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif /* STATS */ } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr_noadv(&so->so_snd, off, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, len, mtod(m, caddr_t) + hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(&so->so_snd, mb, len); m->m_len += len; } else { if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = &so->so_snd; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, hw_tls); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags2 & TF2_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && (sack_rxmit == 0) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. */ if (tp->t_flags2 & TF2_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags2 &= ~TF2_ECN_SND_CWR; } if (tp->t_flags2 & TF2_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. * If a RST segment is sent, advertise a window of zero. */ if (flags & TH_RST) { recwin = 0; } else { if (recwin < (so->so_rcv.sb_hiwat / 4) && recwin < tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); } /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt))) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ m_freem(m); goto out; } } #endif #ifdef INET6 if (isipv6) { /* * There is no need to fill in ip6_plen right now. * It will be filled later by ip6_output. */ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } KASSERT(len + hdrlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u != %u", __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ TCP_PROBE3(debug__output, tp, th, m); /* We're getting ready to send; log now. */ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, NULL, false); /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL) mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } #ifdef STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; tp->gput_seq = startseq; tp->gput_ack = startseq + ulmin(sbavail(&so->so_snd) - off, sendwin); tp->gput_ts = tcp_ts_getticks(); } #endif /* STATS */ } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* * Avoid a situation where we do not set persist timer * after a zero window condition. For example: * 1) A -> B: packet with enough data to fill the window * 2) B -> A: ACK for #1 + new data (0 window * advertisement) * 3) A -> B: ACK for #2, 0 len packet * * In this case, A will not activate the persist timer, * because it chose to send a packet. Unless tcp_output * is called for some other reason (delayed ack timer, * another input packet from B, socket syscall), A will * not send zero window probes. * * So, if you send a 0-length packet, but there is data * in the socket buffer, and neither the rexmt or * persist timer is already set, then activate the * persist timer. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + xlen; } if ((error == 0) && (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0)) { /* Clean up any DSACK's sent */ tcp_clean_dsack_blocks(tp); } if (error) { /* Record the error. */ TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, error, 0, NULL, false); /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EACCES: case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: TCP_XMIT_TIMER_ASSERT(tp, len, flags); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && --maxburst) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) { to->to_flags &= ~TOF_SIGNATURE; continue; } optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } case TOF_FASTOPEN: { int total_len; /* XXX is there any point to aligning this option? */ total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; if (TCP_MAXOLEN - optlen < total_len) { to->to_flags &= ~TOF_FASTOPEN; continue; } *optp++ = TCPOPT_FAST_OPEN; *optp++ = total_len; if (to->to_tfo_len > 0) { bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); optp += to->to_tfo_len; } optlen += total_len; break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } /* * This is a copy of m_copym(), taking the TSO segment size/limit * constraints into account, and advancing the sndptr as it goes. */ struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls) { #ifdef KERN_TLS struct ktls_session *tls, *ntls; struct mbuf *start; #endif struct mbuf *n, **np; struct mbuf *top; int32_t off = off0; int32_t len = *plen; int32_t fragsize; int32_t len_cp = 0; int32_t *pkthdrlen; uint32_t mlen, frags; bool copyhdr; KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off)); KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = true; else copyhdr = false; while (off > 0) { KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; if ((sb) && (m == sb->sb_sndptr)) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } m = m->m_next; } np = ⊤ top = NULL; pkthdrlen = NULL; #ifdef KERN_TLS if (m->m_flags & M_NOMAP) tls = m->m_ext.ext_pgs->tls; else tls = NULL; start = m; #endif while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("tcp_m_copym, length > size of mbuf chain")); *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } #ifdef KERN_TLS if (hw_tls) { if (m->m_flags & M_NOMAP) ntls = m->m_ext.ext_pgs->tls; else ntls = NULL; /* * Avoid mixing TLS records with handshake * data or TLS records from different * sessions. */ if (tls != ntls) { MPASS(m != start); *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } /* * Don't end a send in the middle of a TLS * record if it spans multiple TLS records. */ if (tls != NULL && (m != start) && len < m->m_len) { *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } } #endif mlen = min(len, m->m_len - off); if (seglimit) { /* * For M_NOMAP mbufs, add 3 segments * + 1 in case we are crossing page boundaries * + 2 in case the TLS hdr/trailer are used * It is cheaper to just add the segments * than it is to take the cache miss to look * at the mbuf ext_pgs state in detail. */ if (m->m_flags & M_NOMAP) { fragsize = min(segsize, PAGE_SIZE); frags = 3; } else { fragsize = segsize; frags = 0; } /* Break if we really can't fit anymore. */ if ((frags + 1) >= seglimit) { *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } /* * Reduce size if you can't copy the whole * mbuf. If we can't copy the whole mbuf, also * adjust len so the loop will end after this * mbuf. */ if ((frags + howmany(mlen, fragsize)) >= seglimit) { mlen = (seglimit - frags - 1) * fragsize; len = mlen; *plen = len_cp + len; if (pkthdrlen != NULL) *pkthdrlen = *plen; } frags += howmany(mlen, fragsize); if (frags == 0) frags++; seglimit -= frags; KASSERT(seglimit > 0, ("%s: seglimit went too low", __func__)); } if (copyhdr) n = m_gethdr(M_NOWAIT, m->m_type); else n = m_get(M_NOWAIT, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, M_NOWAIT)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; pkthdrlen = &n->m_pkthdr.len; copyhdr = false; } n->m_len = mlen; len_cp += n->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (sb && (sb->sb_sndptr == m) && ((n->m_len + off) >= m->m_len) && m->m_next) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } off = 0; if (len != M_COPYALL) { len -= n->m_len; } m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } void tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin) { /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. * * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { int lowat; lowat = V_tcp_sendbuf_auto_lowat ? so->so_snd.sb_lowat : 0; if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat - lowat && sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) - lowat && sbused(&so->so_snd) < V_tcp_autosndbuf_max && sendwin >= (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } } Index: head/sys/netinet/tcp_sack.c =================================================================== --- head/sys/netinet/tcp_sack.c (revision 357857) +++ head/sys/netinet/tcp_sack.c (revision 357858) @@ -1,886 +1,886 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 */ /*- * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #include VNET_DECLARE(struct uma_zone *, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); VNET_DEFINE(int, tcp_do_sack) = 1; #define V_tcp_do_sack VNET(tcp_do_sack) SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support"); VNET_DEFINE(int, tcp_sack_maxholes) = 128; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_maxholes), 0, "Maximum number of TCP SACK holes allowed per connection"); VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_globalmaxholes), 0, "Global maximum number of TCP SACK holes"); VNET_DEFINE(int, tcp_sack_globalholes) = 0; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_sack_globalholes), 0, "Global number of TCP SACK holes currently allocated"); /* * This function will find overlaps with the currently stored sackblocks * and add any overlap as a dsack block upfront */ void tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS]; int i, j, n, identical; tcp_seq start, end; INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); if (SEQ_LT(rcv_end, tp->rcv_nxt) || ((rcv_end == tp->rcv_nxt) && (tp->rcv_numsacks > 0 ) && (tp->sackblks[0].end == tp->rcv_nxt))) { saved_blks[0].start = rcv_start; saved_blks[0].end = rcv_end; } else { saved_blks[0].start = saved_blks[0].end = 0; } head_blk.start = head_blk.end = 0; mid_blk.start = rcv_start; mid_blk.end = rcv_end; identical = 0; for (i = 0; i < tp->rcv_numsacks; i++) { start = tp->sackblks[i].start; end = tp->sackblks[i].end; if (SEQ_LT(rcv_end, start)) { /* pkt left to sack blk */ continue; } if (SEQ_GT(rcv_start, end)) { /* pkt right to sack blk */ continue; } if (SEQ_GT(tp->rcv_nxt, end)) { if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) && (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) || (head_blk.start == head_blk.end))) { head_blk.start = SEQ_MAX(rcv_start, start); head_blk.end = SEQ_MIN(rcv_end, end); } continue; } if (((head_blk.start == head_blk.end) || SEQ_LT(start, head_blk.start)) && (SEQ_GT(end, rcv_start) && SEQ_LEQ(start, rcv_end))) { head_blk.start = start; head_blk.end = end; } mid_blk.start = SEQ_MIN(mid_blk.start, start); mid_blk.end = SEQ_MAX(mid_blk.end, end); if ((mid_blk.start == start) && (mid_blk.end == end)) identical = 1; } if (SEQ_LT(head_blk.start, head_blk.end)) { /* store overlapping range */ saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start); saved_blks[0].end = SEQ_MIN(rcv_end, head_blk.end); } n = 1; /* * Second, if not ACKed, store the SACK block that * overlaps with the DSACK block unless it is identical */ if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) && !((mid_blk.start == saved_blks[0].start) && (mid_blk.end == saved_blks[0].end))) || identical == 1) { saved_blks[n].start = mid_blk.start; saved_blks[n++].end = mid_blk.end; } for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) { if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) || SEQ_GT(tp->sackblks[j].start, mid_blk.end)) && (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt)))) saved_blks[n++] = tp->sackblks[j]; } j = 0; for (i = 0; i < n; i++) { /* we can end up with a stale initial entry */ if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) { tp->sackblks[j++] = saved_blks[i]; } } tp->rcv_numsacks = j; } /* * This function is called upon receipt of new valid data (while not in * header prediction mode), and it updates the ordered list of sacks. */ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { /* * First reported block MUST be the most recent one. Subsequent * blocks SHOULD be in the order in which they arrived at the * receiver. These two conditions make the implementation fully * compliant with RFC 2018. */ struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; int num_head, num_saved, i; INP_WLOCK_ASSERT(tp->t_inpcb); /* Check arguments. */ KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end")); if ((rcv_start == rcv_end) && (tp->rcv_numsacks >= 1) && (rcv_end == tp->sackblks[0].end)) { /* retaining DSACK block below rcv_nxt (todrop) */ head_blk = tp->sackblks[0]; } else { /* SACK block for the received segment. */ head_blk.start = rcv_start; head_blk.end = rcv_end; } /* * Merge updated SACK blocks into head_blk, and save unchanged SACK * blocks into saved_blks[]. num_saved will have the number of the * saved SACK blocks. */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this SACK block. */ } else if (SEQ_LEQ(head_blk.start, end) && SEQ_GEQ(head_blk.end, start)) { /* * Merge this SACK block into head_blk. This SACK * block itself will be discarded. */ /* * |-| * |---| merge * * |-| * |---| merge * * |-----| * |-| DSACK smaller * * |-| * |-----| DSACK smaller */ if (head_blk.start == end) head_blk.start = start; else if (head_blk.end == start) head_blk.end = end; else { if (SEQ_LT(head_blk.start, start)) { tcp_seq temp = start; start = head_blk.start; head_blk.start = temp; } if (SEQ_GT(head_blk.end, end)) { tcp_seq temp = end; end = head_blk.end; head_blk.end = temp; } if ((head_blk.start != start) || (head_blk.end != end)) { if ((num_saved >= 1) && SEQ_GEQ(saved_blks[num_saved-1].start, start) && SEQ_LEQ(saved_blks[num_saved-1].end, end)) num_saved--; saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } } else { /* * This block supercedes the prior block */ if ((num_saved >= 1) && SEQ_GEQ(saved_blks[num_saved-1].start, start) && SEQ_LEQ(saved_blks[num_saved-1].end, end)) num_saved--; /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } /* * Update SACK list in tp->sackblks[]. */ num_head = 0; if (SEQ_LT(rcv_start, rcv_end)) { /* * The received data segment is an out-of-order segment. Put * head_blk at the top of SACK list. */ tp->sackblks[0] = head_blk; num_head = 1; /* * If the number of saved SACK blocks exceeds its limit, * discard the last SACK block. */ if (num_saved >= MAX_SACK_BLKS) num_saved--; } if ((rcv_start == rcv_end) && (rcv_start == tp->sackblks[0].end)) { num_head = 1; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[num_head], sizeof(struct sackblk) * num_saved); } /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; } void tcp_clean_dsack_blocks(struct tcpcb *tp) { struct sackblk saved_blks[MAX_SACK_BLKS]; int num_saved, i; INP_WLOCK_ASSERT(tp->t_inpcb); /* * Clean up any DSACK blocks that * are in our queue of sack blocks. * */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this D-SACK block. */ continue; } /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[0], sizeof(struct sackblk) * num_saved); } tp->rcv_numsacks = num_saved; } /* * Delete all receiver-side SACK information. */ void tcp_clean_sackreport(struct tcpcb *tp) { int i; INP_WLOCK_ASSERT(tp->t_inpcb); tp->rcv_numsacks = 0; for (i = 0; i < MAX_SACK_BLKS; i++) tp->sackblks[i].start = tp->sackblks[i].end=0; } /* * Allocate struct sackhole. */ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { struct sackhole *hole; if (tp->snd_numholes >= V_tcp_sack_maxholes || V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { TCPSTAT_INC(tcps_sack_sboverflow); return NULL; } hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT); if (hole == NULL) return NULL; hole->start = start; hole->end = end; hole->rxmit = start; tp->snd_numholes++; atomic_add_int(&V_tcp_sack_globalholes, 1); return hole; } /* * Free struct sackhole. */ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { uma_zfree(V_sack_hole_zone, hole); tp->snd_numholes--; atomic_subtract_int(&V_tcp_sack_globalholes, 1); KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* * Insert new SACK hole into scoreboard. */ static struct sackhole * tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *after) { struct sackhole *hole; /* Allocate a new SACK hole. */ hole = tcp_sackhole_alloc(tp, start, end); if (hole == NULL) return NULL; /* Insert the new SACK hole into scoreboard. */ if (after != NULL) TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); else TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); /* Update SACK hint. */ if (tp->sackhint.nexthole == NULL) tp->sackhint.nexthole = hole; return hole; } /* * Remove SACK hole from scoreboard. */ static void tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) { /* Update SACK hint. */ if (tp->sackhint.nexthole == hole) tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); /* Remove this SACK hole. */ TAILQ_REMOVE(&tp->snd_holes, hole, scblink); /* Free this SACK hole. */ tcp_sackhole_free(tp, hole); } /* * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). * Returns 1 if incoming ACK has previously unknown SACK information, * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes * to that (i.e. left edge moving) would also be considered a change in SACK * information which is slightly different than rfc6675. */ int tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; int i, j, num_sack_blks, sack_changed; INP_WLOCK_ASSERT(tp->t_inpcb); num_sack_blks = 0; sack_changed = 0; /* * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, * treat [SND.UNA, SEG.ACK) as if it is a SACK block. */ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { sack_blocks[num_sack_blks].start = tp->snd_una; sack_blocks[num_sack_blks++].end = th_ack; } /* * Append received valid SACK blocks to sack_blocks[], but only if we * received new blocks from the other side. */ if (to->to_flags & TOF_SACK) { tp->sackhint.sacked_bytes = 0; /* reset */ for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, tp->snd_una) && SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && SEQ_LEQ(sack.end, tp->snd_max)) { sack_blocks[num_sack_blks++] = sack; tp->sackhint.sacked_bytes += (sack.end-sack.start); } } } /* * Return if SND.UNA is not advanced and no valid SACK block is * received. */ if (num_sack_blks == 0) return (sack_changed); /* * Sort the SACK blocks so we can update the scoreboard with just one * pass. The overhead of sorting up to 4+1 elements is less than * making up to 4+1 passes over the scoreboard. */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } if (TAILQ_EMPTY(&tp->snd_holes)) /* * Empty scoreboard. Need to initialize snd_fack (it may be * uninitialized or have a bogus value). Scoreboard holes * (from the sack blocks received) are created later below * (in the logic that adds holes to the tail of the * scoreboard). */ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); /* * In the while-loop below, incoming SACK blocks (sack_blocks[]) and * SACK holes (snd_holes) are traversed from their tails with just * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ tp->sackhint.last_sack_ack = sblkp->end; if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; sack_changed = 1; } else { /* * We failed to add a new hole based on the current * sack block. Skip over all the sack blocks that * fall completely to the right of snd_fack and * proceed to trim the scoreboard based on the * remaining sack blocks. This also trims the * scoreboard for th_ack (which is sack_blocks[0]). */ while (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->start)) sblkp--; if (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } } else if (SEQ_LT(tp->snd_fack, sblkp->end)) { /* fack is advanced. */ tp->snd_fack = sblkp->end; sack_changed = 1; } cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* * Since the incoming sack blocks are sorted, we can process them * making one sweep of the scoreboard. */ while (sblkp >= sack_blocks && cur != NULL) { if (SEQ_GEQ(sblkp->start, cur->end)) { /* * SACKs data beyond the current hole. Go to the * previous sack block. */ sblkp--; continue; } if (SEQ_LEQ(sblkp->end, cur->start)) { /* * SACKs data before the current hole. Go to the * previous hole. */ cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); sack_changed = 1; if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Acks entire hole, so delete hole. */ temp = cur; cur = TAILQ_PREV(cur, sackhole_head, scblink); tcp_sackhole_remove(tp, temp); /* * The sack block may ack all or part of the * next hole too, so continue onto the next * hole. */ continue; } else { /* Move start of hole forward. */ cur->start = sblkp->end; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); } } else { /* Data acks at least the end of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Move end of hole backward. */ cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } else { /* * ACKs some data in middle of a hole; need * to split current hole */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; tp->sackhint.sack_bytes_rexmit += (temp->rxmit - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } } } tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. * Accordingly, we advance one or the other. */ if (SEQ_LEQ(sblkp->start, cur->start)) cur = TAILQ_PREV(cur, sackhole_head, scblink); else sblkp--; } return (sack_changed); } /* * Free all SACK holes to clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *q; INP_WLOCK_ASSERT(tp->t_inpcb); while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) tcp_sackhole_remove(tp, q); tp->sackhint.sack_bytes_rexmit = 0; KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); KASSERT(tp->sackhint.nexthole == NULL, ("tp->sackhint.nexthole == NULL")); } /* * Partial ack handling within a sack recovery episode. Keeping this very * simple for now. When a partial ack is received, force snd_cwnd to a value * that will allow the sender to transmit no more than 2 segments. If * necessary, a better scheme can be adopted at a later point, but for now, * the goal is to prevent the sender from bursting a large amount of data in * the midst of sack recovery. */ void tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) { int num_segs = 1; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + - (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); + (tp->snd_nxt - tp->snd_recover) + num_segs * tp->t_maxseg); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); } #if 0 /* * Debug version of tcp_sack_output() that walks the scoreboard. Used for * now to sanity check the hint. */ static struct sackhole * tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *p; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = 0; TAILQ_FOREACH(p, &tp->snd_holes, scblink) { if (SEQ_LT(p->rxmit, p->end)) { if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ continue; } *sack_bytes_rexmt += (p->rxmit - p->start); break; } *sack_bytes_rexmt += (p->rxmit - p->start); } return (p); } #endif /* * Returns the next hole to retransmit and the number of retransmitted bytes * from the scoreboard. We store both the next hole and the number of * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK * reception). This avoids scoreboard traversals completely. * * The loop here will traverse *at most* one link. Here's the argument. For * the loop to traverse more than 1 link before finding the next hole to * retransmit, we would need to have at least 1 node following the current * hint with (rxmit == end). But, for all holes following the current hint, * (start == rxmit), since we have not yet retransmitted from them. * Therefore, in order to traverse more 1 link in the loop below, we need to * have at least one node following the current hint with (start == rxmit == * end). But that can't happen, (start == end) means that all the data in * that hole has been sacked, in which case, the hole would have been removed * from the scoreboard. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *hole = NULL; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; hole = tp->sackhint.nexthole; if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) goto out; while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { if (SEQ_LT(hole->rxmit, hole->end)) { tp->sackhint.nexthole = hole; break; } } out: return (hole); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tp->t_inpcb); if (cur == NULL) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) return; /* We're already beyond any SACKed blocks */ /*- * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, p->start)) cur = p; else { tp->snd_nxt = p->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->snd_fack; } Index: head/sys/netinet/tcp_usrreq.c =================================================================== --- head/sys/netinet/tcp_usrreq.c (revision 357857) +++ head/sys/netinet/tcp_usrreq.c (revision 357858) @@ -1,2831 +1,2831 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include #include /* * TCP protocol interface to socket abstraction. */ #ifdef INET static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); static void tcp_fill_info(struct tcpcb *, struct tcp_info *); #ifdef TCPDEBUG #define TCPDEBUG0 int ostate = 0 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ tcp_trace(TA_USER, ostate, tp, 0, 0, req) #else #define TCPDEBUG0 #define TCPDEBUG1() #define TCPDEBUG2(req) #endif /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. */ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct tcpcb *tp = NULL; int error; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); TCPDEBUG1(); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); if (error) goto out; } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; error = in_pcballoc(so, &V_tcbinfo); if (error) goto out; inp = sotoinpcb(so); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ } else #endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == NULL) { error = ENOBUFS; in_pcbdetach(inp); in_pcbfree(inp); goto out; } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); TCPSTATES_INC(TCPS_CLOSED); if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; out: TCPDEBUG2(PRU_ATTACH); TCP_PROBE2(debug__user, tp, PRU_ATTACH); return (error); } /* * tcp_usr_detach is called when the socket layer loses its final reference * to the socket, be it a file descriptor reference, a reference from TCP, * etc. At this point, there is only one case in which we will keep around * inpcb state: time wait. */ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); KASSERT(so->so_pcb == inp && inp->inp_socket == so, ("%s: socket %p inp %p mismatch", __func__, so, inp)); tp = intotcpcb(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * There are two cases to handle: one in which the time wait * state is being discarded (INP_DROPPED), and one in which * this connection will remain in timewait. In the former, * it is time to discard all state (except tcptw, which has * already been discarded by the timewait close code, which * should be further up the call stack somewhere). In the * latter case, we detach from the socket, but leave the pcb * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? * * Astute question indeed, from twtcp perspective there are * four cases to consider: * * #1 tcp_usr_detach is called at tcptw creation time by * tcp_twstart, then do not discard the newly created tcptw * and leave inpcb present until timewait ends * #2 tcp_usr_detach is called at tcptw creation time by * tcp_twstart, but connection is local and tw will be * discarded immediately * #3 tcp_usr_detach is called at timewait end (or reuse) by * tcp_twclose, then the tcptw has already been discarded * (or reused) and inpcb is freed here * #4 tcp_usr_detach is called() after timewait ends (or reuse) * (e.g. by soclose), then tcptw has already been discarded * (or reused) and inpcb is freed here * * In all three cases the tcptw should not be freed here. */ if (inp->inp_flags & INP_DROPPED) { in_pcbdetach(inp); if (__predict_true(tp == NULL)) { in_pcbfree(inp); } else { /* * This case should not happen as in TIMEWAIT * state the inp should not be destroyed before * its tcptw. If INVARIANTS is defined, panic. */ #ifdef INVARIANTS panic("%s: Panic before an inp double-free: " "INP_TIMEWAIT && INP_DROPPED && tp != NULL" , __func__); #else log(LOG_ERR, "%s: Avoid an inp double-free: " "INP_TIMEWAIT && INP_DROPPED && tp != NULL" , __func__); #endif INP_WUNLOCK(inp); } } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } else { /* * If the connection is not in timewait, we consider two * two conditions: one in which no further processing is * necessary (dropped || embryonic), and one in which TCP is * not yet done, but no longer requires the socket, so the * pcb will persist for the time being. * * XXXRW: Does the second case still occur? */ if (inp->inp_flags & INP_DROPPED || tp->t_state < TCPS_SYN_SENT) { tcp_discardcb(tp); in_pcbdetach(inp); in_pcbfree(inp); } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } } #ifdef INET /* * Give the socket an address. */ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6; u_char vflagsav; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (sin6->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif error = in6_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: if (error != 0) inp->inp_vflag = vflagsav; TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Prepare to accept connections. */ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } SOCK_UNLOCK(so); if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; u_char vflagsav; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } vflagsav = inp->inp_vflag; tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } SOCK_UNLOCK(so); if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); if (error != 0) inp->inp_vflag = vflagsav; out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) return (error); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); NET_EPOCH_ENTER(et); if ((error = tcp_connect(tp, nam, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tp->t_fb->tfb_tcp_output(tp); out_in_epoch: NET_EPOCH_EXIT(et); out: TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6; u_int8_t incflagsav; u_char vflagsav; TCPDEBUG0; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (sin6->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); #ifdef INET /* * XXXRW: Some confusion: V4/V6 flags relate to binding, and * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sin.sin_addr)) != 0) goto out; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; NET_EPOCH_ENTER(et); if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif error = tp->t_fb->tfb_tcp_output(tp); goto out_in_epoch; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0) goto out; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); NET_EPOCH_ENTER(et); error = tp->t_fb->tfb_tcp_output(tp); #ifdef INET out_in_epoch: #endif NET_EPOCH_EXIT(et); out: /* * If the implicit bind in the connect call fails, restore * the flags we modified. */ if (error != 0 && inp->inp_lport == 0) { inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ static int tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; int error = 0; TCPDEBUG0; NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) goto out; if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); return (error); } #ifdef INET /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { int error = 0; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct in_addr addr; in_port_t port = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in_getpeeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ port = inp->inp_fport; addr = inp->inp_faddr; out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); if (error == 0) *nam = in_sockaddr(port, &addr); return error; } #endif /* INET */ #ifdef INET6 static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = NULL; int error = 0; struct tcpcb *tp = NULL; struct in_addr addr; struct in6_addr addr6; struct epoch_tracker et; in_port_t port = 0; int v4 = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in6_mapped_peeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; port = inp->inp_fport; addr = inp->inp_faddr; } else { port = inp->inp_fport; addr6 = inp->in6p_faddr; } out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); else *nam = in6_sockaddr(port, &addr6); } return error; } #endif /* INET6 */ /* * Mark the connection as being incapable of further output. */ static int tcp_usr_shutdown(struct socket *so) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; TCPDEBUG0; NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); return (error); } /* * After a receive, possibly send window update to peer. */ static int tcp_usr_rcvd(struct socket *so, int flags) { struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * For passively-created TFO connections, don't attempt a window * update while still in SYN_RECEIVED as this may trigger an early * SYN|ACK. It is preferable to have the SYN|ACK be sent along with * application response data, or failing that, when the DELACK timer * expires. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; NET_EPOCH_ENTER(et); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif tp->t_fb->tfb_tcp_output(tp); NET_EPOCH_EXIT(et); out: TCPDEBUG2(PRU_RCVD); TCP_PROBE2(debug__user, tp, PRU_RCVD); INP_WUNLOCK(inp); return (error); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. */ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; #ifdef INET #ifdef INET6 struct sockaddr_in sin; #endif struct sockaddr_in *sinp; #endif #ifdef INET6 int isipv6; #endif u_int8_t incflagsav; u_char vflagsav; bool restoreflags; TCPDEBUG0; /* * We require the pcbinfo "read lock" if we will close the socket * as part of this call. */ NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; restoreflags = false; if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); /* * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible * for freeing memory. */ if (m && (flags & PRUS_NOTREADY) == 0) m_freem(m); error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { switch (nam->sa_family) { #ifdef INET case AF_INET: sinp = (struct sockaddr_in *)nam; if (sinp->sin_len != sizeof(struct sockaddr_in)) { if (m) m_freem(m); error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV6) != 0) { if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr))) { if (m) m_freem(m); goto out; } #ifdef INET6 isipv6 = 0; #endif break; #endif /* INET */ #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)nam; if (sin6->sin6_len != sizeof(*sin6)) { if (m) m_freem(m); error = EINVAL; goto out; } if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; if (m) m_freem(m); goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; if (m) m_freem(m); goto out; } restoreflags = true; inp->inp_vflag &= ~INP_IPV6; sinp = &sin; in6_sin6_2_sin(sinp, sin6); if (IN_MULTICAST( ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; if (m) m_freem(m); goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr))) { if (m) m_freem(m); goto out; } isipv6 = 0; #else /* !INET */ error = EAFNOSUPPORT; if (m) m_freem(m); goto out; #endif /* INET */ } else { if ((inp->inp_vflag & INP_IPV6) == 0) { if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } restoreflags = true; inp->inp_vflag &= ~INP_IPV4; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr))) { if (m) m_freem(m); goto out; } isipv6 = 1; } break; } #endif /* INET6 */ default: if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } } if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { sbappendstream(&so->so_snd, m, flags); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We * no longer want to restore the flags if later * operations fail. */ if (error == 0 || inp->inp_lport != 0) restoreflags = false; if (error) goto out; if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_connect(tp); else { tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ socantsendmore(so); tcp_usrclosed(tp); } if (!(inp->inp_flags & INP_DROPPED) && !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tp->t_fb->tfb_tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ /* * Not going to contemplate SYN|URG */ if (IS_FASTOPEN(tp->t_flags)) tp->t_flags &= ~TF_FASTOPEN; #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We * no longer want to restore the flags if later * operations fail. */ if (error == 0 || inp->inp_lport != 0) restoreflags = false; if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if (!(flags & PRUS_NOTREADY)) { tp->t_flags |= TF_FORCEDATA; error = tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } } TCP_LOG_EVENT(tp, NULL, &inp->inp_socket->so_rcv, &inp->inp_socket->so_snd, TCP_LOG_USERSEND, error, 0, NULL, false); out: /* * If the request was unsuccessful and we changed flags, * restore the original flags. */ if (error != 0 && restoreflags) { inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); return (error); } static int tcp_usr_ready(struct socket *so, struct mbuf *m, int count) { struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp; int error; inp = sotoinpcb(so); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); mb_free_notready(m, count); return (ECONNRESET); } tp = intotcpcb(inp); SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); if (error == 0) { NET_EPOCH_ENTER(et); error = tp->t_fb->tfb_tcp_output(tp); NET_EPOCH_EXIT(et); } INP_WUNLOCK(inp); return (error); } /* * Abort the TCP. Drop the connection abruptly. */ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, drop. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tp = tcp_drop(tp, ECONNABORTED); if (tp == NULL) goto dropped; TCPDEBUG2(PRU_ABORT); TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); dropped: NET_EPOCH_EXIT(et); } /* * TCP socket is closed. Start friendly disconnect. */ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, initiate * a disconnect. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); TCP_PROBE2(debug__user, tp, PRU_CLOSE); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); } /* * Receive out-of-band data. */ static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: TCPDEBUG2(PRU_RCVOOB); TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } #ifdef INET struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp_usr_bind, .pru_connect = tcp_usr_connect, .pru_control = in_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp_usr_listen, .pru_peeraddr = in_getpeeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET */ #ifdef INET6 struct pr_usrreqs tcp6_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp6_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp6_usr_bind, .pru_connect = tcp6_usr_connect, .pru_control = in6_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp6_usr_listen, .pru_peeraddr = in6_mapped_peeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET6 */ #ifdef INET /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local * port number if needed. Call in_pcbconnect_setup to do the routing and * to choose a local host address (interface). If there is an existing * incarnation of the same connection in TIME-WAIT state and if the remote * host was sending CC options and if the connection duration was < MSL, then * truncate the previous TIME-WAIT state and proceed. * Initialize connection parameters and enter SYN-SENT state. */ static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct in_addr laddr; u_short lport; int error; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ laddr = inp->inp_laddr; lport = inp->inp_lport; error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) goto out; if (oinp) { error = EADDRINUSE; goto out; } inp->inp_laddr = laddr; in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: * Scale to fit into sweet spot. See tcp_syncache.c. * XXX: This should move to tcp_output(). */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); if (tp->t_flags & TF_REQ_TSTMP) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } error = in6_pcbconnect(inp, nam, td->td_ucred); if (error != 0) goto out; INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); if (tp->t_flags & TF_REQ_TSTMP) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return error; } #endif /* INET6 */ /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. */ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { INP_WLOCK_ASSERT(tp->t_inpcb); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } if (tp->t_flags2 & TF2_ECN_PERMIT) ti->tcpi_options |= TCPI_OPT_ECN; ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; tcp_offload_tcp_info(tp, ti); } #endif } /* * tcp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_WUNLOCK(inp); \ cleanup; \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error; struct inpcb *inp; struct tcpcb *tp; struct tcp_function_block *blk; struct tcp_function_set fsn; error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { error = ip6_ctloutput(so, sopt); /* * In case of the IPV6_USE_MIN_MTU socket option, * the INC_IPV6MINMTU flag to announce a corresponding * MSS during the initial handshake. * If the TCP connection is not in the front states, * just reduce the MSS being used. * This avoids the sending of TCP segments which will * be fragmented at the IPv6 layer. */ if ((error == 0) && (sopt->sopt_dir == SOPT_SET) && (sopt->sopt_level == IPPROTO_IPV6) && (sopt->sopt_name == IPV6_USE_MIN_MTU)) { INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { INP_WUNLOCK(inp); return (ECONNRESET); } inp->inp_inc.inc_flags |= INC_IPV6MINMTU; tp = intotcpcb(inp); if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if ((opt != NULL) && (opt->ip6po_minmtu == IP6PO_MINMTU_ALL)) { if (tp->t_maxseg > TCP6_MSS) { tp->t_maxseg = TCP6_MSS; } } } INP_WUNLOCK(inp); } } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { error = ip_ctloutput(so, sopt); } #endif return (error); } INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Protect the TCP option TCP_FUNCTION_BLK so * that a sub-function can *never* overwrite this. */ if ((sopt->sopt_dir == SOPT_SET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); if (error) return (error); INP_WLOCK_RECHECK(inp); blk = find_and_ref_tcp_functions(&fsn); if (blk == NULL) { INP_WUNLOCK(inp); return (ENOENT); } if (tp->t_fb == blk) { /* You already have this */ refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (0); } if (tp->t_state != TCPS_CLOSED) { /* * The user has advanced the state * past the initial point, we may not * be able to switch. */ if (blk->tfb_tcp_handoff_ok != NULL) { /* * Does the stack provide a * query mechanism, if so it may * still be possible? */ error = (*blk->tfb_tcp_handoff_ok)(tp); } else error = EINVAL; if (error) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return(error); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (ENOENT); } /* * Release the old refcnt, the * lookup acquired a ref on the * new one already. */ if (tp->t_fb->tfb_tcp_fb_fini) { /* * Tell the stack to cleanup with 0 i.e. * the tcb is not going away. */ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); } #ifdef TCPHPTS /* Assure that we are not on any hpts */ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL); #endif if (blk->tfb_tcp_fb_init) { error = (*blk->tfb_tcp_fb_init)(tp); if (error) { refcount_release(&blk->tfb_refcnt); if (tp->t_fb->tfb_tcp_fb_init) { if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { /* Fall back failed, drop the connection */ INP_WUNLOCK(inp); soabort(so); return(error); } } goto err_out; } } refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = blk; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif err_out: INP_WUNLOCK(inp); return (error); } else if ((sopt->sopt_dir == SOPT_GET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; fsn.pcbcnt = tp->t_fb->tfb_refcnt; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &fsn, sizeof fsn); return (error); } /* Pass in the INP locked, called must unlock it */ return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); } /* * If this assert becomes untrue, we need to change the size of the buf * variable in tcp_default_ctloutput(). */ #ifdef CTASSERT CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN); CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN); #endif int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int error, opt, optval; u_int ui; struct tcp_info ti; #ifdef KERN_TLS struct tls_enable tls; #endif struct cc_algo *algo; char *pbuf, buf[TCP_LOG_ID_LEN]; #ifdef STATS struct statsblob *sbp; #endif size_t len; /* * For TCP_CCALGOOPT forward the control to CC module, for both * SOPT_SET and SOPT_GET. */ switch (sopt->sopt_name) { case TCP_CCALGOOPT: INP_WUNLOCK(inp); if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT) return (EINVAL); pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, sopt->sopt_valsize); if (error) { free(pbuf, M_TEMP); return (error); } INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); if (CC_ALGO(tp)->ctl_output != NULL) error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); else error = ENOENT; INP_WUNLOCK(inp); if (error == 0 && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); free(pbuf, M_TEMP); return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: if (!TCPMD5_ENABLED()) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = TCPMD5_PCBCTL(inp, sopt); if (error) return (error); goto unlock_and_done; #endif /* IPSEC */ case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; unlock_and_done: #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif INP_WUNLOCK(inp); break; case TCP_NOPUSH: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval) tp->t_flags |= TF_NOPUSH; else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = tp->t_fb->tfb_tcp_output(tp); NET_EPOCH_EXIT(et); } } goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= V_tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_STATS: INP_WUNLOCK(inp); #ifdef STATS error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); if (optval > 0) sbp = stats_blob_alloc( V_tcp_perconn_stats_dflt_tpl, 0); else sbp = NULL; INP_WLOCK_RECHECK(inp); if ((tp->t_stats != NULL && sbp == NULL) || (tp->t_stats == NULL && sbp != NULL)) { struct statsblob *t = tp->t_stats; tp->t_stats = sbp; sbp = t; } INP_WUNLOCK(inp); stats_blob_destroy(sbp); #else return (EOPNOTSUPP); #endif /* !STATS */ break; case TCP_CONGESTION: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) break; CC_LIST_RUNLOCK(); if (algo == NULL) { INP_WUNLOCK(inp); error = EINVAL; break; } /* * We hold a write lock over the tcb so it's safe to * do these things without ordering concerns. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); CC_DATA(tp) = NULL; CC_ALGO(tp) = algo; /* * If something goes pear shaped initialising the new * algo, fall back to newreno (which does not * require initialisation). */ if (algo->cb_init != NULL && algo->cb_init(tp->ccv) != 0) { CC_ALGO(tp) = &newreno_cc_algo; /* * The only reason init should fail is * because of malloc. */ error = ENOMEM; } INP_WUNLOCK(inp); break; #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &tls, sizeof(tls), sizeof(tls)); if (error) break; error = ktls_enable_tx(so, &tls); break; case TCP_TXTLS_MODE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); error = ktls_set_tx_mode(so, ui); INP_WUNLOCK(inp); break; #endif case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); if (ui > (UINT_MAX / hz)) { error = EINVAL; break; } ui *= hz; INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_KEEPIDLE: tp->t_keepidle = ui; /* * XXX: better check current remaining * timeout and "merge" it with new value. */ if ((tp->t_state > TCPS_LISTEN) && (tp->t_state <= TCPS_CLOSING)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); break; case TCP_KEEPINTVL: tp->t_keepintvl = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); break; case TCP_KEEPINIT: tp->t_keepinit = ui; if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); break; } goto unlock_and_done; case TCP_KEEPCNT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); tp->t_keepcnt = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); goto unlock_and_done; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval >= 0) tcp_pcap_set_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts), optval); else error = EINVAL; goto unlock_and_done; #endif case TCP_FASTOPEN: { struct tcp_fastopen tfo_optval; INP_WUNLOCK(inp); if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) return (EPERM); error = sooptcopyin(sopt, &tfo_optval, sizeof(tfo_optval), sizeof(int)); if (error) return (error); INP_WLOCK_RECHECK(inp); if (tfo_optval.enable) { if (tp->t_state == TCPS_LISTEN) { if (!V_tcp_fastopen_server_enable) { error = EPERM; goto unlock_and_done; } tp->t_flags |= TF_FASTOPEN; if (tp->t_tfo_pending == NULL) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); } else { /* * If a pre-shared key was provided, * stash it in the client cookie * field of the tcpcb for use during * connect. */ if (sopt->sopt_valsize == sizeof(tfo_optval)) { memcpy(tp->t_tfo_cookie.client, tfo_optval.psk, TCP_FASTOPEN_PSK_LEN); tp->t_tfo_client_cookie_len = TCP_FASTOPEN_PSK_LEN; } tp->t_flags |= TF_FASTOPEN; } } else tp->t_flags &= ~TF_FASTOPEN; goto unlock_and_done; } #ifdef TCP_BLACKBOX case TCP_LOG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); error = tcp_log_state_change(tp, optval); goto unlock_and_done; case TCP_LOGBUF: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_LOGID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); error = tcp_log_set_id(tp, buf); /* tcp_log_set_id() unlocks the INP. */ break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); if (sopt->sopt_name == TCP_LOGDUMP) { error = tcp_log_dump_tp_logbuf(tp, buf, M_WAITOK, true); INP_WUNLOCK(inp); } else { tcp_log_dump_tp_bucket_logbufs(tp, buf); /* * tcp_log_dump_tp_bucket_logbufs() drops the * INP lock. */ } break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: tp = intotcpcb(inp); switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: if (!TCPMD5_ENABLED()) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = TCPMD5_PCBCTL(inp, sopt); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_STATS: { #ifdef STATS int nheld; TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0; error = 0; socklen_t outsbsz = sopt->sopt_valsize; if (tp->t_stats == NULL) error = ENOENT; else if (outsbsz >= tp->t_stats->cursz) outsbsz = tp->t_stats->cursz; else if (outsbsz >= sizeof(struct statsblob)) outsbsz = sizeof(struct statsblob); else error = EINVAL; INP_WUNLOCK(inp); if (error) break; sbp = sopt->sopt_val; nheld = atop(round_page(((vm_offset_t)sbp) + (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp)); vm_page_t ma[nheld]; if (vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)sbp, outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma, nheld) < 0) { error = EFAULT; break; } if ((error = copyin_nofault(&(sbp->flags), &sbflags, SIZEOF_MEMBER(struct statsblob, flags)))) goto unhold; INP_WLOCK_RECHECK(inp); error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats, sbflags | SB_CLONE_USRDSTNOFAULT); INP_WUNLOCK(inp); sopt->sopt_valsize = outsbsz; unhold: vm_page_unhold_pages(ma, nheld); #else INP_WUNLOCK(inp); error = EOPNOTSUPP; #endif /* !STATS */ break; } case TCP_CONGESTION: len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; case TCP_KEEPINTVL: ui = TP_KEEPINTVL(tp) / hz; break; case TCP_KEEPINIT: ui = TP_KEEPINIT(tp) / hz; break; case TCP_KEEPCNT: ui = TP_KEEPCNT(tp); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ui, sizeof(ui)); break; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts)); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_FASTOPEN: optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #ifdef TCP_BLACKBOX case TCP_LOG: optval = tp->t_logstate; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case TCP_LOGBUF: /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */ error = tcp_log_getlogbuf(sopt, tp); break; case TCP_LOGID: len = tcp_log_get_id(tp, buf); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = EINVAL; break; #endif #ifdef KERN_TLS case TCP_TXTLS_MODE: optval = ktls_get_tx_mode(so); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #undef INP_WLOCK_RECHECK #undef INP_WLOCK_RECHECK_CLEANUP /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void tcp_disconnect(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); /* * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. */ if (tp->t_state < TCPS_ESTABLISHED && !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { tp = tcp_drop(tp, 0); KASSERT(tp != NULL, ("tcp_disconnect: tcp_drop() returned NULL")); } else { soisdisconnecting(so); sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) tp->t_fb->tfb_tcp_output(tp); } } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void tcp_usrclosed(struct tcpcb *tp) { NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { case TCPS_LISTEN: #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is * still open. */ KASSERT(tp != NULL, ("tcp_usrclosed: tcp_close() returned NULL")); break; case TCPS_SYN_SENT: case TCPS_SYN_RECEIVED: tp->t_flags |= TF_NEEDFIN; break; case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_tstate(int t_state) { switch (t_state) { case TCPS_CLOSED: db_printf("TCPS_CLOSED"); return; case TCPS_LISTEN: db_printf("TCPS_LISTEN"); return; case TCPS_SYN_SENT: db_printf("TCPS_SYN_SENT"); return; case TCPS_SYN_RECEIVED: db_printf("TCPS_SYN_RECEIVED"); return; case TCPS_ESTABLISHED: db_printf("TCPS_ESTABLISHED"); return; case TCPS_CLOSE_WAIT: db_printf("TCPS_CLOSE_WAIT"); return; case TCPS_FIN_WAIT_1: db_printf("TCPS_FIN_WAIT_1"); return; case TCPS_CLOSING: db_printf("TCPS_CLOSING"); return; case TCPS_LAST_ACK: db_printf("TCPS_LAST_ACK"); return; case TCPS_FIN_WAIT_2: db_printf("TCPS_FIN_WAIT_2"); return; case TCPS_TIME_WAIT: db_printf("TCPS_TIME_WAIT"); return; default: db_printf("unknown"); return; } } static void db_print_tflags(u_int t_flags) { int comma; comma = 0; if (t_flags & TF_ACKNOW) { db_printf("%sTF_ACKNOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_DELACK) { db_printf("%sTF_DELACK", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NODELAY) { db_printf("%sTF_NODELAY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOOPT) { db_printf("%sTF_NOOPT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SENTFIN) { db_printf("%sTF_SENTFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_SCALE) { db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_SCALE) { db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_TSTMP) { db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_TSTMP) { db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SACK_PERMIT) { db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDSYN) { db_printf("%sTF_NEEDSYN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDFIN) { db_printf("%sTF_NEEDFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LQ_OVERFLOW) { db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { db_printf("%sTF_LASTIDLE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RXWIN0SENT) { db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTRECOVERY) { db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_CONGRECOVERY) { db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FORCEDATA) { db_printf("%sTF_FORCEDATA", comma ? ", " : ""); comma = 1; } if (t_flags & TF_TSO) { db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTOPEN) { db_printf("%sTF_FASTOPEN", comma ? ", " : ""); comma = 1; } } static void db_print_tflags2(u_int t_flags2) { int comma; comma = 0; if (t_flags2 & TF2_ECN_PERMIT) { db_printf("%sTF2_ECN_PERMIT", comma ? ", " : ""); comma = 1; } } static void db_print_toobflags(char t_oobflags) { int comma; comma = 0; if (t_oobflags & TCPOOB_HAVEDATA) { db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); comma = 1; } if (t_oobflags & TCPOOB_HADDATA) { db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); comma = 1; } } static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, tp); indent += 2; db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); db_print_indent(indent); db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, &tp->t_timers->tt_delack, tp->t_inpcb); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); db_print_tstate(tp->t_state); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags: 0x%x (", tp->t_flags); db_print_tflags(tp->t_flags); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags2: 0x%x (", tp->t_flags2); db_print_tflags2(tp->t_flags2); db_printf(")\n"); db_print_indent(indent); db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", tp->snd_up, tp->snd_wl1, tp->snd_wl2); db_print_indent(indent); db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", tp->iss, tp->irs, tp->rcv_nxt); db_print_indent(indent); db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n", tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); db_printf("snd_wnd: %u snd_cwnd: %u\n", tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); db_printf("snd_ssthresh: %u snd_recover: " "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_rcvtime: %u t_startime: %u\n", tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, tp->t_rttbest); db_print_indent(indent); db_printf("t_rttupdated: %lu max_sndwnd: %u t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); db_printf("t_oobflags: 0x%x (", tp->t_oobflags); db_print_toobflags(tp->t_oobflags); db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); db_print_indent(indent); db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); db_printf("ts_recent: %u ts_recent_age: %u\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x " "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); db_printf("snd_numholes: %d snd_holes first: %p\n", tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); db_print_indent(indent); - db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " - "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); + db_printf("snd_fack: 0x%08x rcv_numsacks: %d\n", + tp->snd_fack, tp->rcv_numsacks); /* Skip sackblks, sackhint. */ db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; if (!have_addr) { db_printf("usage: show tcpcb \n"); return; } tp = (struct tcpcb *)addr; db_print_tcpcb(tp, "tcpcb", 0); } #endif Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h (revision 357857) +++ head/sys/netinet/tcp_var.h (revision 357858) @@ -1,1034 +1,1032 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from the th->th_flags */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int ispare; /* explicit pad for 64bit alignment */ int sacked_bytes; /* * Total sacked bytes reported by the * receiver via sack option */ uint32_t _pad1[1]; /* TBD */ uint64_t _pad[1]; /* TBD */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: * Organized for 64 byte cacheline efficiency based * on common tcp_input/tcp_output processing. */ struct tcpcb { /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ t_log_state_set: 1, bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* Count of bytes mbufs on all entries */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ /* Cache line 4 */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* input character */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ - tcp_seq sack_newdata; /* New data xmitted in this recovery - episode starts at this seq number */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ /* * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid */ #define TF_UNUSED1 0x00004000 /* unused */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_UNUSED3 0x04000000 /* unused */ #define TF_UNUSED4 0x08000000 /* unused */ #define TF_UNUSED5 0x10000000 /* unused */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; tcp_seq irs; u_short last_win; /* cached window value */ short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; u_int32_t ts_offset; /* our timestamp offset */ u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; void *tw_pspare; /* TCP_SIGNATURE */ u_int *tw_spare; /* TCP_SIGNATURE */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t _pad[12]; /* 6 UTO, 6 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ int64_t spare64[8]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ int32_t spare32[32]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). */ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. */ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_rfc6675_pipe); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_rfc6675_pipe VNET(tcp_do_rfc6675_pipe) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_single_push; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_inp_lro_sack_wake; #ifdef NETFLIX_EXP_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); u_int tcp_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_suspend(struct tcpcb *, uint32_t); void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */