diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c index 2a53b11c3be2..8491a4d49d91 100644 --- a/sys/netinet/in_kdtrace.c +++ b/sys/netinet/in_kdtrace.c @@ -1,472 +1,473 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Mark Johnston * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include SDT_PROVIDER_DEFINE(mib); SDT_PROVIDER_DEFINE(ip); SDT_PROVIDER_DEFINE(tcp); SDT_PROVIDER_DEFINE(udp); SDT_PROVIDER_DEFINE(udplite); #ifndef KDTRACE_NO_MIB_SDT #define MIB_PROBE_IP(name) \ SDT_PROBE_DEFINE1(mib, ip, count, name, \ "int") MIB_PROBE_IP(ips_total); MIB_PROBE_IP(ips_badsum); MIB_PROBE_IP(ips_tooshort); MIB_PROBE_IP(ips_toosmall); MIB_PROBE_IP(ips_badhlen); MIB_PROBE_IP(ips_badlen); MIB_PROBE_IP(ips_fragments); MIB_PROBE_IP(ips_fragdropped); MIB_PROBE_IP(ips_fragtimeout); MIB_PROBE_IP(ips_forward); MIB_PROBE_IP(ips_fastforward); MIB_PROBE_IP(ips_cantforward); MIB_PROBE_IP(ips_redirectsent); MIB_PROBE_IP(ips_noproto); MIB_PROBE_IP(ips_delivered); MIB_PROBE_IP(ips_localout); MIB_PROBE_IP(ips_odropped); MIB_PROBE_IP(ips_reassembled); MIB_PROBE_IP(ips_fragmented); MIB_PROBE_IP(ips_ofragments); MIB_PROBE_IP(ips_cantfrag); MIB_PROBE_IP(ips_badoptions); MIB_PROBE_IP(ips_noroute); MIB_PROBE_IP(ips_badvers); MIB_PROBE_IP(ips_rawout); MIB_PROBE_IP(ips_toolong); MIB_PROBE_IP(ips_notmember); MIB_PROBE_IP(ips_nogif); MIB_PROBE_IP(ips_badaddr); #define MIB_PROBE_IP6(name) \ SDT_PROBE_DEFINE1(mib, ip6, count, name, \ "int") #define MIB_PROBE2_IP6(name) \ SDT_PROBE_DEFINE2(mib, ip6, count, name, \ "int", "int") MIB_PROBE_IP6(ip6s_total); MIB_PROBE_IP6(ip6s_tooshort); MIB_PROBE_IP6(ip6s_toosmall); MIB_PROBE_IP6(ip6s_fragments); MIB_PROBE_IP6(ip6s_fragdropped); MIB_PROBE_IP6(ip6s_fragtimeout); MIB_PROBE_IP6(ip6s_fragoverflow); MIB_PROBE_IP6(ip6s_forward); MIB_PROBE_IP6(ip6s_cantforward); MIB_PROBE_IP6(ip6s_redirectsent); MIB_PROBE_IP6(ip6s_delivered); MIB_PROBE_IP6(ip6s_localout); MIB_PROBE_IP6(ip6s_odropped); MIB_PROBE_IP6(ip6s_reassembled); MIB_PROBE_IP6(ip6s_atomicfrags); MIB_PROBE_IP6(ip6s_fragmented); MIB_PROBE_IP6(ip6s_ofragments); MIB_PROBE_IP6(ip6s_cantfrag); MIB_PROBE_IP6(ip6s_badoptions); MIB_PROBE_IP6(ip6s_noroute); MIB_PROBE_IP6(ip6s_badvers); MIB_PROBE_IP6(ip6s_rawout); MIB_PROBE_IP6(ip6s_badscope); MIB_PROBE_IP6(ip6s_notmember); MIB_PROBE2_IP6(ip6s_nxthist); MIB_PROBE_IP6(ip6s_m1); MIB_PROBE2_IP6(ip6s_m2m); MIB_PROBE_IP6(ip6s_mext1); MIB_PROBE_IP6(ip6s_mext2m); MIB_PROBE_IP6(ip6s_exthdrtoolong); MIB_PROBE_IP6(ip6s_nogif); MIB_PROBE_IP6(ip6s_toomanyhdr); MIB_PROBE_IP6(ip6s_sources_none); MIB_PROBE2_IP6(ip6s_sources_sameif); MIB_PROBE2_IP6(ip6s_sources_otherif); MIB_PROBE2_IP6(ip6s_sources_samescope); MIB_PROBE2_IP6(ip6s_sources_otherscope); MIB_PROBE2_IP6(ip6s_sources_deprecated); MIB_PROBE2_IP6(ip6s_sources_rule); #define MIB_PROBE_ICMP(name) \ SDT_PROBE_DEFINE1(mib, icmp, count, name, \ "int") #define MIB_PROBE2_ICMP(name) \ SDT_PROBE_DEFINE2(mib, icmp, count, name, \ "int", "int") MIB_PROBE_ICMP(icps_error); MIB_PROBE_ICMP(icps_oldshort); MIB_PROBE_ICMP(icps_oldicmp); MIB_PROBE2_ICMP(icps_outhist); MIB_PROBE_ICMP(icps_badcode); MIB_PROBE_ICMP(icps_tooshort); MIB_PROBE_ICMP(icps_checksum); MIB_PROBE_ICMP(icps_badlen); MIB_PROBE_ICMP(icps_reflect); MIB_PROBE2_ICMP(icps_inhist); MIB_PROBE_ICMP(icps_bmcastecho); MIB_PROBE_ICMP(icps_bmcasttstamp); MIB_PROBE_ICMP(icps_badaddr); MIB_PROBE_ICMP(icps_noroute); #define MIB_PROBE_ICMP6(name) \ SDT_PROBE_DEFINE1(mib, icmp6, count, name, \ "int") #define MIB_PROBE2_ICMP6(name) \ SDT_PROBE_DEFINE2(mib, icmp6, count, name, \ "int", "int") MIB_PROBE_ICMP6(icp6s_error); MIB_PROBE_ICMP6(icp6s_canterror); MIB_PROBE_ICMP6(icp6s_toofreq); MIB_PROBE2_ICMP6(icp6s_outhist); MIB_PROBE_ICMP6(icp6s_badcode); MIB_PROBE_ICMP6(icp6s_tooshort); MIB_PROBE_ICMP6(icp6s_checksum); MIB_PROBE_ICMP6(icp6s_badlen); MIB_PROBE_ICMP6(icp6s_dropped); MIB_PROBE_ICMP6(icp6s_reflect); MIB_PROBE2_ICMP6(icp6s_inhist); MIB_PROBE_ICMP6(icp6s_nd_toomanyopt); MIB_PROBE_ICMP6(icp6s_odst_unreach_noroute); MIB_PROBE_ICMP6(icp6s_odst_unreach_admin); MIB_PROBE_ICMP6(icp6s_odst_unreach_beyondscope); MIB_PROBE_ICMP6(icp6s_odst_unreach_addr); MIB_PROBE_ICMP6(icp6s_odst_unreach_noport); MIB_PROBE_ICMP6(icp6s_opacket_too_big); MIB_PROBE_ICMP6(icp6s_otime_exceed_transit); MIB_PROBE_ICMP6(icp6s_otime_exceed_reassembly); MIB_PROBE_ICMP6(icp6s_oparamprob_header); MIB_PROBE_ICMP6(icp6s_oparamprob_nextheader); MIB_PROBE_ICMP6(icp6s_oparamprob_option); MIB_PROBE_ICMP6(icp6s_oredirect); MIB_PROBE_ICMP6(icp6s_ounknown); MIB_PROBE_ICMP6(icp6s_pmtuchg); MIB_PROBE_ICMP6(icp6s_nd_badopt); MIB_PROBE_ICMP6(icp6s_badns); MIB_PROBE_ICMP6(icp6s_badna); MIB_PROBE_ICMP6(icp6s_badrs); MIB_PROBE_ICMP6(icp6s_badra); MIB_PROBE_ICMP6(icp6s_badredirect); MIB_PROBE_ICMP6(icp6s_overflowdefrtr); MIB_PROBE_ICMP6(icp6s_overflowprfx); MIB_PROBE_ICMP6(icp6s_overflownndp); MIB_PROBE_ICMP6(icp6s_overflowredirect); MIB_PROBE_ICMP6(icp6s_invlhlim); #define MIB_PROBE_UDP(name) SDT_PROBE_DEFINE1(mib, udp, count, name, "int") MIB_PROBE_UDP(udps_ipackets); MIB_PROBE_UDP(udps_hdrops); MIB_PROBE_UDP(udps_badsum); MIB_PROBE_UDP(udps_nosum); MIB_PROBE_UDP(udps_badlen); MIB_PROBE_UDP(udps_noport); MIB_PROBE_UDP(udps_noportbcast); MIB_PROBE_UDP(udps_fullsock); MIB_PROBE_UDP(udps_pcbcachemiss); MIB_PROBE_UDP(udps_pcbhashmiss); MIB_PROBE_UDP(udps_opackets); MIB_PROBE_UDP(udps_fastout); MIB_PROBE_UDP(udps_noportmcast); MIB_PROBE_UDP(udps_filtermcast); #define MIB_PROBE_TCP(name) SDT_PROBE_DEFINE1(mib, tcp, count, name, "int") MIB_PROBE_TCP(tcps_connattempt); MIB_PROBE_TCP(tcps_accepts); MIB_PROBE_TCP(tcps_connects); MIB_PROBE_TCP(tcps_drops); MIB_PROBE_TCP(tcps_conndrops); MIB_PROBE_TCP(tcps_minmmsdrops); MIB_PROBE_TCP(tcps_closed); MIB_PROBE_TCP(tcps_segstimed); MIB_PROBE_TCP(tcps_rttupdated); MIB_PROBE_TCP(tcps_delack); MIB_PROBE_TCP(tcps_timeoutdrop); MIB_PROBE_TCP(tcps_rexmttimeo); MIB_PROBE_TCP(tcps_persisttimeo); MIB_PROBE_TCP(tcps_keeptimeo); MIB_PROBE_TCP(tcps_keepprobe); MIB_PROBE_TCP(tcps_keepdrops); MIB_PROBE_TCP(tcps_progdrops); MIB_PROBE_TCP(tcps_sndtotal); MIB_PROBE_TCP(tcps_sndpack); MIB_PROBE_TCP(tcps_sndbyte); MIB_PROBE_TCP(tcps_sndrexmitpack); MIB_PROBE_TCP(tcps_sndrexmitbyte); MIB_PROBE_TCP(tcps_sndrexmitbad); MIB_PROBE_TCP(tcps_sndacks); MIB_PROBE_TCP(tcps_sndprobe); MIB_PROBE_TCP(tcps_sndurg); MIB_PROBE_TCP(tcps_sndwinup); MIB_PROBE_TCP(tcps_sndctrl); MIB_PROBE_TCP(tcps_rcvtotal); MIB_PROBE_TCP(tcps_rcvpack); MIB_PROBE_TCP(tcps_rcvbyte); MIB_PROBE_TCP(tcps_rcvbadsum); MIB_PROBE_TCP(tcps_rcvbadoff); MIB_PROBE_TCP(tcps_rcvreassfull); MIB_PROBE_TCP(tcps_rcvshort); MIB_PROBE_TCP(tcps_rcvduppack); MIB_PROBE_TCP(tcps_rcvdupbyte); MIB_PROBE_TCP(tcps_rcvpartduppack); MIB_PROBE_TCP(tcps_rcvpartdupbyte); MIB_PROBE_TCP(tcps_rcvoopack); MIB_PROBE_TCP(tcps_rcvoobyte); MIB_PROBE_TCP(tcps_rcvpackafterwin); MIB_PROBE_TCP(tcps_rcvbyteafterwin); MIB_PROBE_TCP(tcps_rcvafterclose); MIB_PROBE_TCP(tcps_rcvwinprobe); MIB_PROBE_TCP(tcps_rcvdupack); MIB_PROBE_TCP(tcps_rcvacktoomuch); MIB_PROBE_TCP(tcps_rcvackpack); MIB_PROBE_TCP(tcps_rcvackbyte); MIB_PROBE_TCP(tcps_rcvwinupd); MIB_PROBE_TCP(tcps_pawsdrop); MIB_PROBE_TCP(tcps_predack); MIB_PROBE_TCP(tcps_preddat); MIB_PROBE_TCP(tcps_pcbackemiss); MIB_PROBE_TCP(tcps_cachedrtt); MIB_PROBE_TCP(tcps_cachedrttvar); MIB_PROBE_TCP(tcps_cachedssthresh); MIB_PROBE_TCP(tcps_usedrtt); MIB_PROBE_TCP(tcps_usedrttvar); MIB_PROBE_TCP(tcps_usedssthresh); MIB_PROBE_TCP(tcps_persistdrop); MIB_PROBE_TCP(tcps_badsyn); MIB_PROBE_TCP(tcps_mturesent); MIB_PROBE_TCP(tcps_listendrop); MIB_PROBE_TCP(tcps_badrst); MIB_PROBE_TCP(tcps_sc_added); MIB_PROBE_TCP(tcps_sc_retransmitted); MIB_PROBE_TCP(tcps_sc_dupsyn); MIB_PROBE_TCP(tcps_sc_dropped); MIB_PROBE_TCP(tcps_sc_completed); MIB_PROBE_TCP(tcps_sc_bucketoverflow); MIB_PROBE_TCP(tcps_sc_cacheoverflow); MIB_PROBE_TCP(tcps_sc_reset); MIB_PROBE_TCP(tcps_sc_stale); MIB_PROBE_TCP(tcps_sc_aborted); MIB_PROBE_TCP(tcps_sc_badack); MIB_PROBE_TCP(tcps_sc_unreach); MIB_PROBE_TCP(tcps_sc_zonefail); MIB_PROBE_TCP(tcps_sc_sendcookie); MIB_PROBE_TCP(tcps_sc_recvcookie); MIB_PROBE_TCP(tcps_hc_added); MIB_PROBE_TCP(tcps_hc_bucketoverflow); MIB_PROBE_TCP(tcps_finwait2_drops); MIB_PROBE_TCP(tcps_sack_recovery_episode); MIB_PROBE_TCP(tcps_sack_rexmits); +MIB_PROBE_TCP(tcps_sack_rexmits_tso); MIB_PROBE_TCP(tcps_sack_rexmit_bytes); MIB_PROBE_TCP(tcps_sack_rcv_blocks); MIB_PROBE_TCP(tcps_sack_send_blocks); MIB_PROBE_TCP(tcps_sack_lostrexmt); MIB_PROBE_TCP(tcps_sack_sboverflow); MIB_PROBE_TCP(tcps_ecn_rcvce); MIB_PROBE_TCP(tcps_ecn_rcvect0); MIB_PROBE_TCP(tcps_ecn_rcvect1); MIB_PROBE_TCP(tcps_ecn_shs); MIB_PROBE_TCP(tcps_ecn_rcwnd); MIB_PROBE_TCP(tcps_sig_rcvgoodsig); MIB_PROBE_TCP(tcps_sig_rcvbadsig); MIB_PROBE_TCP(tcps_sig_err_buildsig); MIB_PROBE_TCP(tcps_sig_err_sigopt); MIB_PROBE_TCP(tcps_sig_err_nosigopt); MIB_PROBE_TCP(tcps_pmtud_blackhole_activated); MIB_PROBE_TCP(tcps_pmtud_blackhole_activated_min_mss); MIB_PROBE_TCP(tcps_pmtud_blackhole_failed); MIB_PROBE_TCP(tcps_tunneled_pkts); MIB_PROBE_TCP(tcps_tunneled_errs); MIB_PROBE_TCP(tcps_dsack_count); MIB_PROBE_TCP(tcps_dsack_bytes); MIB_PROBE_TCP(tcps_dsack_tlp_bytes); MIB_PROBE_TCP(tcps_tw_recycles); MIB_PROBE_TCP(tcps_tw_resets); MIB_PROBE_TCP(tcps_tw_responds); MIB_PROBE_TCP(tcps_ace_nect); MIB_PROBE_TCP(tcps_ace_ect1); MIB_PROBE_TCP(tcps_ace_ect0); MIB_PROBE_TCP(tcps_ace_ce); MIB_PROBE_TCP(tcps_ecn_sndect0); MIB_PROBE_TCP(tcps_ecn_sndect1); MIB_PROBE_TCP(tcps_tlpresends); MIB_PROBE_TCP(tcps_tlpresend_bytes); #endif SDT_PROBE_DEFINE6_XLATE(ip, , , receive, "void *", "pktinfo_t *", "void *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct ifnet *", "ifinfo_t *", "struct ip *", "ipv4info_t *", "struct ip6_hdr *", "ipv6info_t *"); SDT_PROBE_DEFINE6_XLATE(ip, , , send, "void *", "pktinfo_t *", "void *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct ifnet *", "ifinfo_t *", "struct ip *", "ipv4info_t *", "struct ip6_hdr *", "ipv6info_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__request, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , receive, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , send, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE1_XLATE(tcp, , , siftr, "struct pkt_node *", "siftrinfo_t *"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__input, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *", "struct mbuf *", "ipinfo_t *"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__output, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *", "struct mbuf *", "ipinfo_t *"); SDT_PROBE_DEFINE2_XLATE(tcp, , , debug__user, "struct tcpcb *", "tcpsinfo_t *" , "int", "int"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__drop, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *", "struct mbuf *", "ipinfo_t *"); SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change, "void *", "void *", "struct tcpcb *", "csinfo_t *", "void *", "void *", "struct tcpcb *", "tcpsinfo_t *", "void *", "void *", "int", "tcplsinfo_t *"); SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize, "void *", "void *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *", "int", "int"); SDT_PROBE_DEFINE5_XLATE(udp, , , receive, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udpsinfo_t *", "struct udphdr *", "udpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udp, , , send, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udpsinfo_t *", "struct udphdr *", "udpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udplite, , , receive, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udplitesinfo_t *", "struct udphdr *", "udpliteinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udplite, , , send, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udplitesinfo_t *", "struct udphdr *", "udpliteinfo_t *"); diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h index 780839299993..9896af96eb84 100644 --- a/sys/netinet/in_kdtrace.h +++ b/sys/netinet/in_kdtrace.h @@ -1,409 +1,410 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Mark Johnston * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_IN_KDTRACE_H_ #define _SYS_IN_KDTRACE_H_ #include #define IP_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) #define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4) #define UDPLITE_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udplite, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE1(probe, arg0) \ SDT_PROBE1(tcp, , , probe, arg0) #define TCP_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(tcp, , , probe, arg0, arg1) #define TCP_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(tcp, , , probe, arg0, arg1, arg2) #define TCP_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(tcp, , , probe, arg0, arg1, arg2, arg3) #define TCP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(tcp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) SDT_PROVIDER_DECLARE(ip); SDT_PROVIDER_DECLARE(tcp); SDT_PROVIDER_DECLARE(udp); SDT_PROVIDER_DECLARE(udplite); #ifndef KDTRACE_NO_MIB_SDT SDT_PROVIDER_DECLARE(mib); SDT_PROBE_DECLARE(mib, ip, count, ips_total); SDT_PROBE_DECLARE(mib, ip, count, ips_badsum); SDT_PROBE_DECLARE(mib, ip, count, ips_tooshort); SDT_PROBE_DECLARE(mib, ip, count, ips_toosmall); SDT_PROBE_DECLARE(mib, ip, count, ips_badhlen); SDT_PROBE_DECLARE(mib, ip, count, ips_badlen); SDT_PROBE_DECLARE(mib, ip, count, ips_fragments); SDT_PROBE_DECLARE(mib, ip, count, ips_fragdropped); SDT_PROBE_DECLARE(mib, ip, count, ips_fragtimeout); SDT_PROBE_DECLARE(mib, ip, count, ips_forward); SDT_PROBE_DECLARE(mib, ip, count, ips_fastforward); SDT_PROBE_DECLARE(mib, ip, count, ips_cantforward); SDT_PROBE_DECLARE(mib, ip, count, ips_redirectsent); SDT_PROBE_DECLARE(mib, ip, count, ips_noproto); SDT_PROBE_DECLARE(mib, ip, count, ips_delivered); SDT_PROBE_DECLARE(mib, ip, count, ips_localout); SDT_PROBE_DECLARE(mib, ip, count, ips_odropped); SDT_PROBE_DECLARE(mib, ip, count, ips_reassembled); SDT_PROBE_DECLARE(mib, ip, count, ips_fragmented); SDT_PROBE_DECLARE(mib, ip, count, ips_ofragments); SDT_PROBE_DECLARE(mib, ip, count, ips_cantfrag); SDT_PROBE_DECLARE(mib, ip, count, ips_badoptions); SDT_PROBE_DECLARE(mib, ip, count, ips_noroute); SDT_PROBE_DECLARE(mib, ip, count, ips_badvers); SDT_PROBE_DECLARE(mib, ip, count, ips_rawout); SDT_PROBE_DECLARE(mib, ip, count, ips_toolong); SDT_PROBE_DECLARE(mib, ip, count, ips_notmember); SDT_PROBE_DECLARE(mib, ip, count, ips_nogif); SDT_PROBE_DECLARE(mib, ip, count, ips_badaddr); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_total); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_tooshort); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_toosmall); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragments); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragdropped); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragtimeout); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragoverflow); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_forward); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_cantforward); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_redirectsent); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_delivered); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_localout); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_odropped); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_reassembled); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_atomicfrags); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragmented); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_ofragments); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_cantfrag); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_badoptions); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_noroute); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_badvers); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_rawout); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_badscope); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_notmember); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_nxthist); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_m1); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_m2m); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_mext1); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_mext2m); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_exthdrtoolong); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_nogif); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_toomanyhdr); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_none); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_sameif); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_otherif); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_samescope); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_otherscope); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_deprecated); SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_rule); SDT_PROBE_DECLARE(mib, icmp, count, icps_error); SDT_PROBE_DECLARE(mib, icmp, count, icps_oldshort); SDT_PROBE_DECLARE(mib, icmp, count, icps_oldicmp); SDT_PROBE_DECLARE(mib, icmp, count, icps_outhist); SDT_PROBE_DECLARE(mib, icmp, count, icps_badcode); SDT_PROBE_DECLARE(mib, icmp, count, icps_tooshort); SDT_PROBE_DECLARE(mib, icmp, count, icps_checksum); SDT_PROBE_DECLARE(mib, icmp, count, icps_badlen); SDT_PROBE_DECLARE(mib, icmp, count, icps_reflect); SDT_PROBE_DECLARE(mib, icmp, count, icps_inhist); SDT_PROBE_DECLARE(mib, icmp, count, icps_bmcastecho); SDT_PROBE_DECLARE(mib, icmp, count, icps_bmcasttstamp); SDT_PROBE_DECLARE(mib, icmp, count, icps_badaddr); SDT_PROBE_DECLARE(mib, icmp, count, icps_noroute); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_error); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_canterror); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_toofreq); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_outhist); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badcode); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_tooshort); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_checksum); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badlen); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_dropped); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_reflect); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_inhist); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_nd_toomanyopt); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_noroute); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_admin); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_beyondscope); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_addr); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_noport); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_opacket_too_big); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_otime_exceed_transit); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_otime_exceed_reassembly); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oparamprob_header); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oparamprob_nextheader); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oparamprob_option); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oredirect); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_ounknown); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_pmtuchg); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_nd_badopt); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badns); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badna); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badrs); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badra); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badredirect); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflowdefrtr); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflowprfx); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflownndp); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflowredirect); SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_invlhlim); SDT_PROBE_DECLARE(mib, udp, count, udps_ipackets); SDT_PROBE_DECLARE(mib, udp, count, udps_hdrops); SDT_PROBE_DECLARE(mib, udp, count, udps_badsum); SDT_PROBE_DECLARE(mib, udp, count, udps_nosum); SDT_PROBE_DECLARE(mib, udp, count, udps_badlen); SDT_PROBE_DECLARE(mib, udp, count, udps_noport); SDT_PROBE_DECLARE(mib, udp, count, udps_noportbcast); SDT_PROBE_DECLARE(mib, udp, count, udps_fullsock); SDT_PROBE_DECLARE(mib, udp, count, udps_pcbcachemiss); SDT_PROBE_DECLARE(mib, udp, count, udps_pcbhashmiss); SDT_PROBE_DECLARE(mib, udp, count, udps_opackets); SDT_PROBE_DECLARE(mib, udp, count, udps_fastout); SDT_PROBE_DECLARE(mib, udp, count, udps_noportmcast); SDT_PROBE_DECLARE(mib, udp, count, udps_filtermcast); SDT_PROBE_DECLARE(mib, tcp, count, tcps_connattempt); SDT_PROBE_DECLARE(mib, tcp, count, tcps_accepts); SDT_PROBE_DECLARE(mib, tcp, count, tcps_connects); SDT_PROBE_DECLARE(mib, tcp, count, tcps_drops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_conndrops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_minmssdrops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_closed); SDT_PROBE_DECLARE(mib, tcp, count, tcps_segstimed); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rttupdated); SDT_PROBE_DECLARE(mib, tcp, count, tcps_delack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_timeoutdrop); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rexmttimeo); SDT_PROBE_DECLARE(mib, tcp, count, tcps_persisttimeo); SDT_PROBE_DECLARE(mib, tcp, count, tcps_keeptimeo); SDT_PROBE_DECLARE(mib, tcp, count, tcps_keepprobe); SDT_PROBE_DECLARE(mib, tcp, count, tcps_keepdrops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_progdrops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndtotal); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndpack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndbyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndrexmitpack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndrexmitbyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndrexmitbad); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndacks); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndprobe); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndurg); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndwinup); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndctrl); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvtotal); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbadsum); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbadoff); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvreassfull); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvshort); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvduppack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvdupbyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpartduppack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpartdupbyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvoopack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvoobyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpackafterwin); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbyteafterwin); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvafterclose); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvwinprobe); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvdupack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvacktoomuch); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvackpack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvackbyte); SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvwinupd); SDT_PROBE_DECLARE(mib, tcp, count, tcps_pawsdrop); SDT_PROBE_DECLARE(mib, tcp, count, tcps_predack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_preddat); SDT_PROBE_DECLARE(mib, tcp, count, tcps_pcbcachemiss); SDT_PROBE_DECLARE(mib, tcp, count, tcps_cachedrtt); SDT_PROBE_DECLARE(mib, tcp, count, tcps_cachedrttvar); SDT_PROBE_DECLARE(mib, tcp, count, tcps_cachedssthresh); SDT_PROBE_DECLARE(mib, tcp, count, tcps_usedrtt); SDT_PROBE_DECLARE(mib, tcp, count, tcps_usedrttvar); SDT_PROBE_DECLARE(mib, tcp, count, tcps_usedssthresh); SDT_PROBE_DECLARE(mib, tcp, count, tcps_persistdrop); SDT_PROBE_DECLARE(mib, tcp, count, tcps_badsyn); SDT_PROBE_DECLARE(mib, tcp, count, tcps_mturesent); SDT_PROBE_DECLARE(mib, tcp, count, tcps_listendrop); SDT_PROBE_DECLARE(mib, tcp, count, tcps_badrst); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_added); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_retransmitted); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_dupsyn); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_dropped); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_completed); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_bucketoverflow); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_cacheoverflow); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_reset); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_stale); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_aborted); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_badack); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_unreach); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_zonefail); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_sendcookie); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_recvcookie); SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_added); SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_bucketoverflow); SDT_PROBE_DECLARE(mib, tcp, count, tcps_finwait2_drops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_recovery_episode); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmits); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmits_tso); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmit_bytes); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rcv_blocks); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_send_blocks); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_lostrexmt); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_sboverflow); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcvce); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcvect0); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcvect1); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_shs); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcwnd); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_rcvgoodsig); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_rcvbadsig); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_err_buildsig); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_err_sigopt); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_err_nosigopt); SDT_PROBE_DECLARE(mib, tcp, count, tcps_pmtud_blackhole_activated); SDT_PROBE_DECLARE(mib, tcp, count, tcps_pmtud_blackhole_activated_min_mss); SDT_PROBE_DECLARE(mib, tcp, count, tcps_pmtud_blackhole_failed); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tunneled_pkts); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tunneled_errs); SDT_PROBE_DECLARE(mib, tcp, count, tcps_dsack_count); SDT_PROBE_DECLARE(mib, tcp, count, tcps_dsack_bytes); SDT_PROBE_DECLARE(mib, tcp, count, tcps_dsack_tlp_bytes); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tw_recycles); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tw_resets); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tw_responds); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_nect); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_ect1); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_ect0); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_ce); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_sndect0); SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_sndect1); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tlpresends); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tlpresend_bytes); #endif SDT_PROBE_DECLARE(ip, , , receive); SDT_PROBE_DECLARE(ip, , , send); SDT_PROBE_DECLARE(tcp, , , accept__established); SDT_PROBE_DECLARE(tcp, , , accept__refused); SDT_PROBE_DECLARE(tcp, , , connect__established); SDT_PROBE_DECLARE(tcp, , , connect__refused); SDT_PROBE_DECLARE(tcp, , , connect__request); SDT_PROBE_DECLARE(tcp, , , receive); SDT_PROBE_DECLARE(tcp, , , send); SDT_PROBE_DECLARE(tcp, , , siftr); SDT_PROBE_DECLARE(tcp, , , state__change); SDT_PROBE_DECLARE(tcp, , , debug__input); SDT_PROBE_DECLARE(tcp, , , debug__output); SDT_PROBE_DECLARE(tcp, , , debug__user); SDT_PROBE_DECLARE(tcp, , , debug__drop); SDT_PROBE_DECLARE(tcp, , , receive__autoresize); SDT_PROBE_DECLARE(udp, , , receive); SDT_PROBE_DECLARE(udp, , , send); SDT_PROBE_DECLARE(udplite, , , receive); SDT_PROBE_DECLARE(udplite, , , send); /* * These constants originate from the 4.4BSD sys/protosw.h. They lost * their initial purpose in 2c37256e5a59, when single pr_usrreq method * was split into multiple methods. However, they were used by TCPDEBUG, * a feature barely used, but it kept them in the tree for many years. * In 5d06879adb95 DTrace probes started to use them. Note that they * are not documented in dtrace_tcp(4), so they are likely to be * eventually renamed to something better and extended/trimmed. */ #define PRU_ATTACH 0 /* attach protocol to up */ #define PRU_DETACH 1 /* detach protocol from up */ #define PRU_BIND 2 /* bind socket to address */ #define PRU_LISTEN 3 /* listen for connection */ #define PRU_CONNECT 4 /* establish connection to peer */ #define PRU_ACCEPT 5 /* accept connection from peer */ #define PRU_DISCONNECT 6 /* disconnect from peer */ #define PRU_SHUTDOWN 7 /* won't send any more data */ #define PRU_RCVD 8 /* have taken data; more room now */ #define PRU_SEND 9 /* send this data */ #define PRU_ABORT 10 /* abort (fast DISCONNECT, DETATCH) */ #define PRU_CONTROL 11 /* control operations on protocol */ #define PRU_SENSE 12 /* return status into m */ #define PRU_RCVOOB 13 /* retrieve out of band data */ #define PRU_SENDOOB 14 /* send out of band data */ #define PRU_SOCKADDR 15 /* fetch socket's address */ #define PRU_PEERADDR 16 /* fetch peer's address */ #define PRU_CONNECT2 17 /* connect two sockets */ /* begin for protocols internal use */ #define PRU_FASTTIMO 18 /* 200ms timeout */ #define PRU_SLOWTIMO 19 /* 500ms timeout */ #define PRU_PROTORCV 20 /* receive from below */ #define PRU_PROTOSEND 21 /* send to below */ /* end for protocol's internal use */ #define PRU_SEND_EOF 22 /* send and close */ #define PRU_SOSETLABEL 23 /* MAC label change */ #define PRU_CLOSE 24 /* socket close */ #define PRU_FLUSH 25 /* flush the socket */ #define PRU_NREQ 25 #ifdef PRUREQUESTS const char *prurequests[] = { "ATTACH", "DETACH", "BIND", "LISTEN", "CONNECT", "ACCEPT", "DISCONNECT", "SHUTDOWN", "RCVD", "SEND", "ABORT", "CONTROL", "SENSE", "RCVOOB", "SENDOOB", "SOCKADDR", "PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO", "PROTORCV", "PROTOSEND", "SEND_EOF", "SOSETLABEL", "CLOSE", "FLUSH", }; #endif #endif diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index e64b7701c453..c318e8517c2e 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1,2170 +1,2173 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0; #define V_tcp_sendbuf_auto_lowat VNET(tcp_sendbuf_auto_lowat) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendbuf_auto_lowat), 0, "Modify threshold for auto send buffer growth to account for SO_SNDLOWAT"); /* * Make sure that either retransmit or persist timer is set for SYN, FIN and * non-ACK. */ #define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ tcp_timer_active((tp), TT_REXMT) || \ tcp_timer_active((tp), TT_PERSIST), \ ("neither rexmt nor persist timer is set")) #ifdef TCP_HHOOK /* * Wrapper for the TCP established output helper hook. */ void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, &tp->t_osd); } } #endif /* * CC wrapper hook functions */ void cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tptoinpcb(tp)); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(&tp->t_ccv); } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_default_output(struct tcpcb *tp) { struct socket *so = tptosocket(tp); struct inpcb *inp = tptoinpcb(tp); int32_t len; uint32_t recwin, sendwin; uint16_t flags; int off, error = 0; /* Keep compiler happy */ u_int if_hw_tsomaxsegcount = 0; u_int if_hw_tsomaxsegsize = 0; struct mbuf *m; struct ip *ip = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int idle, sendalot, curticks; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; struct udphdr *udp = NULL; struct tcp_log_buffer *lgb; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; const bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif #ifdef KERN_TLS const bool hw_tls = tp->t_nic_ktls_xmit != 0; #else const bool hw_tls = false; #endif NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * For TFO connections in SYN_SENT or SYN_RECEIVED, * only allow the initial SYN or SYN|ACK and those sent * by the retransmit timer. */ if ((tp->t_flags & TF_FASTOPEN) && ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* SYN or SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (((ticks - tp->t_rcvtime) >= tp->t_rxtcur) || (tp->t_sndtime && ((ticks - tp->t_sndtime) >= tp->t_rxtcur)))) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { uint32_t cwin; cwin = imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else { /* Can rexmit part of the current hole */ len = ((int32_t)ulmin(cwin, SEQ_SUB(tp->snd_recover, p->rxmit))); } } else { len = ((int32_t)ulmin(cwin, SEQ_SUB(p->end, p->rxmit))); } if (len > 0) { off = SEQ_SUB(p->rxmit, tp->snd_una); KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); sack_rxmit = 1; sendalot = 1; } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) { len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - off); } else { int32_t cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - imax(0, (int32_t) (tp->snd_nxt - tp->snd_recover)) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = imin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if ((tp->t_flags & TF_FASTOPEN) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) len = 0; /* Without fast-open there should never be data sent on a SYN. */ if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) { len = 0; } if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * We also do a general check here to ensure that * we will set the persist timer when we have data * to send, but a 0-byte window. This makes sure * the persist timer is set even if the packet * hits one of the "goto send" lines below. */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (off < (int) sbavail(&so->so_snd)) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); #endif /* INET */ #endif /* IPSEC */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(inp); else #endif if (inp->inp_options) ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && ((sack_rxmit == 0) || V_tcp_sack_tso) && ipoptlen == 0 && !(flags & TH_SYN)) tso = 1; if (SEQ_LT((sack_rxmit ? p->rxmit : tp->snd_nxt) + len, tp->snd_una + sbused(&so->so_snd))) { flags &= ~TH_FIN; } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * As the TCP header options are now * considered when setting up the initial * window, we would not send the last segment * if we skip considering the option length here. * Note: this may not work when tcp headers change * very dynamically in the future. */ if ((((tp->t_flags & TF_SIGNATURE) ? PADTCPOLEN(TCPOLEN_SIGNATURE) : 0) + ((tp->t_flags & TF_RCVD_TSTMP) ? PADTCPOLEN(TCPOLEN_TIMESTAMP) : 0) + len) >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ int32_t adv; int oldwin; adv = recwin; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); if (adv > oldwin) adv -= oldwin; else adv = 0; } else oldwin = 0; /* * If the new window size ends up being the same as or less * than the old size when it is scaled, then don't force * a window update. */ if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg || adv >= TCP_MAXWIN << tp->rcv_scale)) goto send; if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; } /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { to.to_mss = tcp_mssopt(&inp->inp_inc); if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ dont_sendalot = 1; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { curticks = tcp_ts_getticks(); to.to_tsval = curticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; if (tp->t_rxtshift == 1) tp->t_badrxtwin = curticks; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ /* * Check that TCP_MD5SIG is enabled in tcpcb to * account the size needed to set this TCP option. */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = (tp->t_maxseg - optlen); if (((uint32_t)off + (uint32_t)len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put * the IP header chain and the TCP header in * one packet as required by RFC 7112, don't * send it. Also ensure that at least one * byte of the payload can be put into the * TCP segment. */ SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; sack_rxmit = 0; goto out; } len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; if (dont_sendalot) sendalot = 0; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; struct sockbuf *msb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); #ifdef STATS if (SEQ_LT(tp->snd_nxt, tp->snd_max)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); else stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif /* STATS */ } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); if (sack_rxmit) { TCPSTAT_INC(tcps_sack_rexmits); + if (tso) { + TCPSTAT_INC(tcps_sack_rexmits_tso); + } TCPSTAT_ADD(tcps_sack_rexmit_bytes, len); } #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif /* STATS */ } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif /* STATS */ } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr_noadv(&so->so_snd, off, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, len, mtod(m, caddr_t) + hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(&so->so_snd, mb, len); m->m_len += len; } else { if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = &so->so_snd; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, hw_tls); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else { th = (struct tcphdr *)(ip6 + 1); } tcpip_fillheaders(inp, tp->t_port, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, tp->t_port, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { flags |= tcp_ecn_output_syn_sent(tp); } /* Also handle parallel SYN for ECN */ if ((TCPS_HAVERCVDSYN(tp->t_state)) && (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) tp->t_flags2 &= ~TF2_ECN_SND_ECE; #ifdef INET6 if (isipv6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << IPV6_FLOWLABEL_LEN); ip6->ip6_flow |= htonl(ect << IPV6_FLOWLABEL_LEN); } else #endif { ip->ip_tos &= ~IPTOS_ECN_MASK; ip->ip_tos |= ect; } } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; /* * Lost Retransmission Detection * trigger resending of a (then * still existing) hole, when * fack acks recoverypoint. */ if ((tp->t_flags & TF_LRD) && SEQ_GEQ(p->rxmit, p->end)) p->rxmit = tp->snd_recover; tp->sackhint.sack_bytes_rexmit += len; } if (IN_RECOVERY(tp->t_flags)) { /* * Account all bytes transmitted while * IN_RECOVERY, simplifying PRR and * Lost Retransmit Detection */ tp->sackhint.prr_out += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } tcp_set_flags(th, flags); /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. * If a RST segment is sent, advertise a window of zero. */ if (flags & TH_RST) { recwin = 0; } else { if (recwin < (so->so_rcv.sb_hiwat / 4) && recwin < tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); } /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else { /* Avoid shrinking window with window scaling. */ recwin = roundup2(recwin, 1 << tp->rcv_scale); th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); } /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt))) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ m_freem(m); goto out; } } #endif #ifdef INET6 if (isipv6) { /* * There is no need to fill in ip6_plen right now. * It will be filled later by ip6_output. */ if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } KASSERT(len + hdrlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u != %u", __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif TCP_PROBE3(debug__output, tp, th, m); /* We're getting ready to send; log now. */ /* XXXMT: We are not honoring verbose logging. */ if (tcp_bblogging_on(tp)) lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, NULL, false, NULL, NULL, 0, NULL); else lgb = NULL; /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, inp->in6p_outputopts, &inp->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) mtu = inp->inp_route6.ro_nh->nh_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif error = ip_output(m, inp->inp_options, &inp->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) mtu = inp->inp_route.ro_nh->nh_mtu; } #endif /* INET */ if (lgb != NULL) { lgb->tlb_errno = error; lgb = NULL; } out: if (error == 0) tcp_account_for_send(tp, len, (tp->snd_nxt != tp->snd_max), 0, hw_tls); /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. In a closed * state just return. */ if (flags & TH_RST) { TCPSTAT_INC(tcps_sndtotal); return (0); } else if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { /* * Update "made progress" indication if we just * added new data to an empty socket buffer. */ if (tp->snd_una == tp->snd_max) tp->t_acktime = ticks; tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ tp->t_sndtime = ticks; if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } #ifdef STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; tp->gput_seq = startseq; tp->gput_ack = startseq + ulmin(sbavail(&so->so_snd) - off, sendwin); tp->gput_ts = tcp_ts_getticks(); } #endif /* STATS */ } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* * Avoid a situation where we do not set persist timer * after a zero window condition. For example: * 1) A -> B: packet with enough data to fill the window * 2) B -> A: ACK for #1 + new data (0 window * advertisement) * 3) A -> B: ACK for #2, 0 len packet * * In this case, A will not activate the persist timer, * because it chose to send a packet. Unless tcp_output * is called for some other reason (delayed ack timer, * another input packet from B, socket syscall), A will * not send zero window probes. * * So, if you send a 0-length packet, but there is data * in the socket buffer, and neither the rexmt or * persist timer is already set, then activate the * persist timer. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + xlen; } if ((error == 0) && (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0)) { /* Clean up any DSACK's sent */ tcp_clean_dsack_blocks(tp); } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit = SEQ_MIN(p->end, p->rxmit) - len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); KASSERT((flags & TH_FIN) == 0, ("error while FIN with SACK rxmit")); } else { tp->snd_nxt -= len; if (flags & TH_FIN) tp->snd_nxt--; } if (IN_RECOVERY(tp->t_flags)) tp->sackhint.prr_out -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EACCES: case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: TCP_XMIT_TIMER_ASSERT(tp, len, flags); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; int maxunacktime; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * If the state is already closed, don't bother. */ if (tp->t_state == TCPS_CLOSED) return; /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); if (TP_MAXUNACKTIME(tp) && tp->t_acktime) { maxunacktime = tp->t_acktime + TP_MAXUNACKTIME(tp) - ticks; if (maxunacktime < 1) maxunacktime = 1; if (maxunacktime < tt) tt = maxunacktime; } tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < V_tcp_retries) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) { to->to_flags &= ~TOF_SIGNATURE; continue; } optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } case TOF_FASTOPEN: { int total_len; /* XXX is there any point to aligning this option? */ total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; if (TCP_MAXOLEN - optlen < total_len) { to->to_flags &= ~TOF_FASTOPEN; continue; } *optp++ = TCPOPT_FAST_OPEN; *optp++ = total_len; if (to->to_tfo_len > 0) { bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); optp += to->to_tfo_len; } optlen += total_len; break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } /* * This is a copy of m_copym(), taking the TSO segment size/limit * constraints into account, and advancing the sndptr as it goes. */ struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls) { #ifdef KERN_TLS struct ktls_session *tls, *ntls; struct mbuf *start __diagused; #endif struct mbuf *n, **np; struct mbuf *top; int32_t off = off0; int32_t len = *plen; int32_t fragsize; int32_t len_cp = 0; int32_t *pkthdrlen; uint32_t mlen, frags; bool copyhdr; KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off)); KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = true; else copyhdr = false; while (off > 0) { KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; if ((sb) && (m == sb->sb_sndptr)) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } m = m->m_next; } np = ⊤ top = NULL; pkthdrlen = NULL; #ifdef KERN_TLS if (hw_tls && (m->m_flags & M_EXTPG)) tls = m->m_epg_tls; else tls = NULL; start = m; #endif while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("tcp_m_copym, length > size of mbuf chain")); *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } #ifdef KERN_TLS if (hw_tls) { if (m->m_flags & M_EXTPG) ntls = m->m_epg_tls; else ntls = NULL; /* * Avoid mixing TLS records with handshake * data or TLS records from different * sessions. */ if (tls != ntls) { MPASS(m != start); *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } } #endif mlen = min(len, m->m_len - off); if (seglimit) { /* * For M_EXTPG mbufs, add 3 segments * + 1 in case we are crossing page boundaries * + 2 in case the TLS hdr/trailer are used * It is cheaper to just add the segments * than it is to take the cache miss to look * at the mbuf ext_pgs state in detail. */ if (m->m_flags & M_EXTPG) { fragsize = min(segsize, PAGE_SIZE); frags = 3; } else { fragsize = segsize; frags = 0; } /* Break if we really can't fit anymore. */ if ((frags + 1) >= seglimit) { *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } /* * Reduce size if you can't copy the whole * mbuf. If we can't copy the whole mbuf, also * adjust len so the loop will end after this * mbuf. */ if ((frags + howmany(mlen, fragsize)) >= seglimit) { mlen = (seglimit - frags - 1) * fragsize; len = mlen; *plen = len_cp + len; if (pkthdrlen != NULL) *pkthdrlen = *plen; } frags += howmany(mlen, fragsize); if (frags == 0) frags++; seglimit -= frags; KASSERT(seglimit > 0, ("%s: seglimit went too low", __func__)); } if (copyhdr) n = m_gethdr(M_NOWAIT, m->m_type); else n = m_get(M_NOWAIT, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, M_NOWAIT)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; pkthdrlen = &n->m_pkthdr.len; copyhdr = false; } n->m_len = mlen; len_cp += n->m_len; if (m->m_flags & (M_EXT | M_EXTPG)) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (sb && (sb->sb_sndptr == m) && ((n->m_len + off) >= m->m_len) && m->m_next) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } off = 0; if (len != M_COPYALL) { len -= n->m_len; } m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } void tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin) { /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. * * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { int lowat; lowat = V_tcp_sendbuf_auto_lowat ? so->so_snd.sb_lowat : 0; if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat - lowat && sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) - lowat && sbused(&so->so_snd) < V_tcp_autosndbuf_max && sendwin >= (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(so, SO_SND, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 6d60c21e64e9..01dd7198511c 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,1591 +1,1592 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #include #endif #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */ /* Types of ending byte info */ #define TCP_EI_EMPTY_SLOT 0 #define TCP_EI_STATUS_CLIENT_FIN 0x1 #define TCP_EI_STATUS_CLIENT_RST 0x2 #define TCP_EI_STATUS_SERVER_FIN 0x3 #define TCP_EI_STATUS_SERVER_RST 0x4 #define TCP_EI_STATUS_RETRAN 0x5 #define TCP_EI_STATUS_PROGRESS 0x6 #define TCP_EI_STATUS_PERSIST_MAX 0x7 #define TCP_EI_STATUS_KEEP_MAX 0x8 #define TCP_EI_STATUS_DATA_A_CLOSE 0x9 #define TCP_EI_STATUS_RST_IN_FRONT 0xa #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb #define TCP_TRK_REQ_LOG_NEW 0x01 #define TCP_TRK_REQ_LOG_COMPLETE 0x02 #define TCP_TRK_REQ_LOG_FREED 0x03 #define TCP_TRK_REQ_LOG_ALLOCFAIL 0x04 #define TCP_TRK_REQ_LOG_MOREYET 0x05 #define TCP_TRK_REQ_LOG_FORCEFREE 0x06 #define TCP_TRK_REQ_LOG_STALE 0x07 #define TCP_TRK_REQ_LOG_SEARCH 0x08 /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but * for human representation. To check a bit we * take and shift over by 1 minus the value (1-8). */ /************************************************/ #define TCP_EI_BITS_CLIENT_FIN 0x001 #define TCP_EI_BITS_CLIENT_RST 0x002 #define TCP_EI_BITS_SERVER_FIN 0x004 #define TCP_EI_BITS_SERVER_RST 0x008 #define TCP_EI_BITS_RETRAN 0x010 #define TCP_EI_BITS_PROGRESS 0x020 #define TCP_EI_BITS_PRESIST_MAX 0x040 #define TCP_EI_BITS_KEEP_MAX 0x080 #define TCP_EI_BITS_DATA_A_CLO 0x100 #define TCP_EI_BITS_RST_IN_FR 0x200 /* a front state reset */ #define TCP_EI_BITS_2MS_TIMER 0x400 /* 2 MSL timer expired */ #if defined(_KERNEL) || defined(_WANT_TCPCB) #include #include #include /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from tcp_get_flags() */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int32_t sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int32_t delivered_data; /* Newly acked data from last SACK */ int32_t sacked_bytes; /* Total sacked bytes reported by the * receiver via sack option */ uint32_t recover_fs; /* Flight Size at the start of Loss recovery */ uint32_t prr_delivered; /* Total bytes delivered using PRR */ uint32_t prr_out; /* Bytes sent during IN_RECOVERY */ int32_t hole_bytes; /* current number of bytes in scoreboard holes */ int32_t lost_bytes; /* number of rfc6675 IsLost() bytes */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); #define TCP_TRK_TRACK_FLG_EMPTY 0x00 /* Available */ #define TCP_TRK_TRACK_FLG_USED 0x01 /* In use */ #define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */ #define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */ #define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */ #define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */ #define TCP_TRK_TRACK_FLG_LSND 0x20 /* We were able to set the Last Sent */ #define MAX_TCP_TRK_REQ 5 /* Max we will have at once */ struct tcp_sendfile_track { uint64_t timestamp; /* User sent timestamp */ uint64_t start; /* Start of sendfile offset */ uint64_t end; /* End if not open-range req */ uint64_t localtime; /* Time we actually got the req */ uint64_t deadline; /* If in CU mode, deadline to delivery */ uint64_t first_send; /* Time of first send in the range */ uint64_t cspr; /* Client suggested pace rate */ uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */ uint64_t sent_at_ls; /* Sent value at the last send */ uint64_t rxt_at_ls; /* Retransmit value at the last send */ tcp_seq start_seq; /* First TCP Seq assigned */ tcp_seq end_seq; /* If range req last seq */ uint32_t flags; /* Type of request open etc */ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */ uint32_t hint_maxseg; /* Client hinted maxseg */ uint32_t playout_ms; /* Client playout ms */ uint32_t hybrid_flags; /* Hybrid flags on this request */ }; /* * Change Query responses for a stack switch we create a structure * that allows query response from the new stack to the old, if * supported. * * There are three queries currently defined. * - sendmap * - timers * - rack_times * * For the sendmap query the caller fills in the * req and the req_param as the first seq (usually * snd_una). When the response comes back indicating * that there was data (return value 1), then the caller * can build a sendmap entry based on the range and the * times. The next query would then be done at the * newly created sendmap_end. Repeated until sendmap_end == snd_max. * * Flags in sendmap_flags are defined below as well. * * For timers the standard PACE_TMR_XXXX flags are returned indicating * a pacing timer (possibly) and one other timer. If pacing timer then * the expiration timeout time in microseconds is in timer_pacing_to. * And the value used with whatever timer (if a flag is set) is in * timer_rxt. If no timers are running a 0 is returned and of * course no flags are set in timer_hpts_flags. * * The rack_times are a misc collection of information that * the old stack might possibly fill in. Of course its possible * that an old stack may not have a piece of information. If so * then setting that value to zero is advised. Setting any * timestamp passed should only place a zero in it when it * is unfilled. This may mean that a time is off by a micro-second * but this is ok in the grand scheme of things. * * When switching stacks it is desireable to get as much information * from the old stack to the new stack as possible. Though not always * will the stack be compatible in the types of information. The * init() function needs to take care when it begins changing * things such as inp_flags2 and the timer units to position these * changes at a point where it is unlikely they will fail after * making such changes. A stack optionally can have an "undo" * function * * To transfer information to the old stack from the new in * respect to LRO and the inp_flags2, the new stack should set * the inp_flags2 to what it supports. The old stack in its * fini() function should call the tcp_handle_orphaned_packets() * to clean up any packets. Note that a new stack should attempt */ /* Query types */ #define TCP_QUERY_SENDMAP 1 #define TCP_QUERY_TIMERS_UP 2 #define TCP_QUERY_RACK_TIMES 3 /* Flags returned in sendmap_flags */ #define SNDMAP_ACKED 0x000001/* The remote endpoint acked this */ #define SNDMAP_OVERMAX 0x000008/* We have more retran's then we can fit */ #define SNDMAP_SACK_PASSED 0x000010/* A sack was done above this block */ #define SNDMAP_HAS_FIN 0x000040/* segment is sent with fin */ #define SNDMAP_TLP 0x000080/* segment sent as tail-loss-probe */ #define SNDMAP_HAS_SYN 0x000800/* SYN is on this guy */ #define SNDMAP_HAD_PUSH 0x008000/* Push was sent on original send */ #define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\ |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH) #define SNDMAP_NRTX 3 struct tcp_query_resp { int req; uint32_t req_param; union { struct { tcp_seq sendmap_start; tcp_seq sendmap_end; int sendmap_send_cnt; uint64_t sendmap_time[SNDMAP_NRTX]; uint64_t sendmap_ack_arrival; int sendmap_flags; uint32_t sendmap_r_rtr_bytes; /* If FAS is available if not 0 */ uint32_t sendmap_fas; uint8_t sendmap_dupacks; }; struct { uint32_t timer_hpts_flags; uint32_t timer_pacing_to; uint32_t timer_timer_exp; }; struct { /* Timestamps and rtt's */ uint32_t rack_reorder_ts; /* Last uscts that reordering was seen */ uint32_t rack_num_dsacks; /* Num of dsacks seen */ uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */ uint32_t rack_min_rtt; /* never 0 smallest rtt seen */ uint32_t rack_rtt; /* Last rtt used by rack */ uint32_t rack_tmit_time; /* The time the rtt seg was tmited */ uint32_t rack_time_went_idle; /* If in persist the time we went idle */ /* Prr data */ uint32_t rack_sacked; uint32_t rack_holes_rxt; uint32_t rack_prr_delivered; uint32_t rack_prr_recovery_fs; uint32_t rack_prr_out; uint32_t rack_prr_sndcnt; /* TLP data */ uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */ /* Various bits */ uint8_t rack_tlp_out; /* Is a TLP outstanding */ uint8_t rack_srtt_measured; /* The previous stack has measured srtt */ uint8_t rack_in_persist; /* Is the old stack in persists? */ uint8_t rack_wanted_output; /* Did the prevous stack have a want output set */ }; }; }; #define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */ #define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */ typedef enum { TT_REXMT = 0, TT_PERSIST, TT_KEEP, TT_2MSL, TT_DELACK, TT_N, } tt_which; typedef enum { TT_PROCESSING = 0, TT_PROCESSED, TT_STARTING, TT_STOPPING, } tt_what; /* * Tcp control block, one per tcp connection. */ struct tcpcb { struct inpcb t_inpcb; /* embedded protocol independent cb */ #define t_start_zero t_fb #define t_zero_size (sizeof(struct tcpcb) - \ offsetof(struct tcpcb, t_start_zero)) struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ struct callout t_callout; sbintime_t t_timers[TT_N]; sbintime_t t_precisions[TT_N]; /* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */ TAILQ_ENTRY(tcpcb) t_hpts; /* linkage to HPTS ring */ STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input packets queue */ uint32_t t_hpts_request; /* Current hpts request, zero if * fits in the pacing window. */ uint32_t t_hpts_slot; /* HPTS wheel slot this tcb is. */ uint32_t t_hpts_drop_reas; /* Reason we are dropping the pcb. */ uint32_t t_hpts_gencnt; uint16_t t_hpts_cpu; /* CPU chosen by hpts_cpuid(). */ uint16_t t_lro_cpu; /* CPU derived from LRO. */ #define HPTS_CPU_NONE ((uint16_t)-1) enum { IHPTS_NONE = 0, IHPTS_ONQUEUE, IHPTS_MOVING, } t_in_hpts; /* Is it linked into HPTS? */ uint32_t t_maxseg:24, /* maximum segment size */ _t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ t_log_state_set: 1, bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t ts_offset; /* our timestamp offset */ uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ uint32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* total reassembly queue byte length */ struct tsegqe_head t_segq; /* segment reassembly queue */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; /* RACK and BBR incoming new data was acked */ u_int t_sndtime; /* time last data was sent */ u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ char t_oobflags; /* have some */ char t_iobc; /* input character */ uint8_t t_nic_ktls_xmit:1, /* active nic ktls xmit sessions */ t_nic_ktls_xmit_dis:1, /* disabled nic xmit ktls? */ t_nic_ktls_spare:6; /* spare nic ktls */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_fbyte_in; /* ticks time first byte queued in */ u_int t_fbyte_out; /* ticks time first byte queued out */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_blackhole_enter; /* when to enter blackhole detection */ int t_blackhole_exit; /* when to exit blackhole detection */ u_int t_rttmin; /* minimum rtt allowed */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *t_cc; /* congestion control algorithm */ struct cc_var t_ccv; /* congestion control specific vars */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ uint32_t t_rcep; /* Number of received CE marked pkts */ uint32_t t_scep; /* Synced number of delivered CE pkts */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ /* Should these be a pointer to the arrays or an array? */ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint32_t t_maxpeakrate; /* max peak rate set by user, bytes/s */ uint32_t t_sndtlppack; /* tail loss probe packets sent */ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ uint64_t t_sndbytes; /* total bytes sent */ uint64_t t_snd_rxt_bytes; /* total bytes retransmitted */ uint32_t t_dsack_bytes; /* dsack bytes received */ uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */ uint32_t t_dsack_pack; /* dsack packets we have eceived */ uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */ uint8_t t_rttupdated; /* number of times rtt sampled */ /* TCP Fast Open */ uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */ uint32_t t_end_info_status; /* Status flag of end info */ unsigned int *t_tfo_pending; /* TFO server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ union { uint8_t t_end_info_bytes[TCP_END_BYTE_INFO]; uint64_t t_end_info; }; struct osd t_osd; /* storage for Khelp module data */ uint8_t _t_logpoint; /* Used when a BB log points is enabled */ /* * Keep all #ifdef'ed components at the end of the structure! * This is important to minimize problems when compiling modules * using this structure from within the modules' directory. */ #ifdef TCP_REQUEST_TRK /* Response tracking addons. */ uint8_t t_tcpreq_req; /* Request count */ uint8_t t_tcpreq_open; /* Number of open range requests */ uint8_t t_tcpreq_closed; /* Number of closed range requests */ uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */ uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */ uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */ struct tcp_sendfile_track t_tcpreq_info[MAX_TCP_TRK_REQ]; #endif #ifdef TCP_ACCOUNTING uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS]; #endif #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* SACK scoreboard update status */ typedef enum { SACK_NOCHANGE = 0, SACK_CHANGE, SACK_NEWLOSS } sackstatus_t; /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_PORT_MIN 0 #define TCP_TUNNELING_PORT_MAX 65535 #define TCP_TUNNELING_PORT_DEFAULT 0 /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_OVERHEAD_MIN sizeof(struct udphdr) #define TCP_TUNNELING_OVERHEAD_MAX 1024 #define TCP_TUNNELING_OVERHEAD_DEFAULT TCP_TUNNELING_OVERHEAD_MIN /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ /** * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * * tfb_tcp_fb_init is used to allow the new stack to * setup its control block. Among the things it must * do is: * a) Make sure that the inp_flags2 is setup correctly * for LRO. There are two flags that the previous * stack may have set INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. If the new stack does not * support these it *should* clear the flags. * b) Make sure that the timers are in the proper * granularity that the stack wants. The stack * should check the t_tmr_granularity field. Currently * there are two values that it may hold * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC. * Use the functions tcp_timer_convert(tp, granularity); * to move the timers to the correct format for your stack. * * The new stack may also optionally query the tfb_chg_query * function if the old stack has one. The new stack may ask * for one of three entries and can also state to the old * stack its support for the INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. This is important since if there are * queued ack's without that statement the old stack will * be forced to discard the queued acks. The requests that * can be made for information by the new stacks are: * * Note also that the tfb_tcp_fb_init() when called can * determine if a query is needed by looking at the * value passed in the ptr. The ptr is designed to be * set in with any allocated memory, but the address * of the condtion (ptr == &tp->t_fb_ptr) will be * true if this is not a stack switch but the initial * setup of a tcb (which means no query would be needed). * If, however, the value is not t_fb_ptr, then the caller * is in the middle of a stack switch and is the new stack. * A query would be appropriate (if the new stack support * the query mechanism). * * TCP_QUERY_SENDMAP - Query of outstanding data. * TCP_QUERY_TIMERS_UP - Query about running timers. * TCP_SUPPORTED_LRO - Declaration in req_param of * the inp_flags2 supported by * the new stack. * TCP_QUERY_RACK_TIMES - Enquire about various timestamps * and states the old stack may be in. * * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). The * tfb_tcp_fb_fini() function itself should not change timers * or inp_flags2 (the tfb_tcp_fb_init() must do that). However * if the old stack supports the LRO mbuf queuing, and the new * stack does not communicate via chg messages that it too does, * it must assume it does not and free any queued mbufs. * */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); void (*tfb_tcp_do_segment)(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t); int (*tfb_do_segment_nounlock)(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t, int, struct timeval *); int (*tfb_do_queued_segments)(struct tcpcb *, int); int (*tfb_tcp_ctloutput)(struct tcpcb *, struct sockopt *); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *, void **); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *tp); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *); void (*tfb_switch_failed)(struct tcpcb *); bool (*tfb_early_wake_check)(struct tcpcb *); int (*tfb_compute_pipe)(struct tcpcb *tp); int (*tfb_stack_info)(struct tcpcb *tp, struct stack_specific_info *); void (*tfb_inherit)(struct tcpcb *tp, struct inpcb *h_inp); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); struct tcpcb * tcp_drop(struct tcpcb *, int); #ifdef _NETINET_IN_PCB_H_ #define intotcpcb(inp) __containerof((inp), struct tcpcb, t_inpcb) #define sototcpcb(so) intotcpcb(sotoinpcb(so)) #define tptoinpcb(tp) (&(tp)->t_inpcb) #define tptosocket(tp) (tp)->t_inpcb.inp_socket /* * tcp_output() * Handles tcp_drop request from advanced stacks and reports that inpcb is * gone with negative return code. * Drop in replacement for the default stack. */ static inline int tcp_output(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); int rv; INP_WLOCK_ASSERT(inp); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); tp = tcp_drop(tp, -rv); if (tp) INP_WUNLOCK(inp); } return (rv); } /* * tcp_output_unlock() * Always returns unlocked, handles drop request from advanced stacks. * Always returns positive error code. */ static inline int tcp_output_unlock(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); int rv; INP_WLOCK_ASSERT(inp); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); rv = -rv; tp = tcp_drop(tp, rv); if (tp) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); return (rv); } /* * tcp_output_nodrop() * Always returns locked. It is caller's responsibility to run tcp_drop()! * Useful in syscall implementations, when we want to perform some logging * and/or tracing with tcpcb before calling tcp_drop(). To be used with * tcp_unlock_or_drop() later. * * XXXGL: maybe don't allow stacks to return a drop request at certain * TCP states? Why would it do in connect(2)? In recv(2)? */ static inline int tcp_output_nodrop(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tptoinpcb(tp)); rv = tp->t_fb->tfb_tcp_output(tp); KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); return (rv); } /* * tcp_unlock_or_drop() * Handle return code from tfb_tcp_output() after we have logged/traced, * to be used with tcp_output_nodrop(). */ static inline int tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); if (tcp_output_retval < 0) { tcp_output_retval = -tcp_output_retval; if (tcp_drop(tp, tcp_output_retval) != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); return (tcp_output_retval); } #endif /* _NETINET_IN_PCB_H_ */ static int inline tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack) { return ((ack - tp->snd_una) / tp->t_maxseg + ((((ack - tp->snd_una) % tp->t_maxseg) != 0) ? 1 : 0)); } #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid * Note: accessing and restoring from * these may only be done in the 1st * RTO recovery round (t_rxtshift == 1) */ #define TF_WAKESOR 0x00004000 /* wake up receive socket */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_SONOTCONN 0x00020000 /* needs soisconnected() on ESTAB */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_CLOSED 0x04000000 /* close(2) called on socket */ #define TF_SENTSYN 0x08000000 /* At least one syn has been sent */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ #define TF2_HPTS_CPU_SET 0x00000200 /* t_hpts_cpu is not random */ #define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */ #define TF2_ECN_USE_ECT1 0x00000800 /* Use ECT(1) marking on session */ #define TF2_TCP_ACCOUNTING 0x00001000 /* Do TCP accounting */ #define TF2_HPTS_CALLS 0x00002000 /* tcp_output() called via HPTS */ #define TF2_MBUF_L_ACKS 0x00004000 /* large mbufs for ack compression */ #define TF2_MBUF_ACKCMP 0x00008000 /* mbuf ack compression ok */ #define TF2_SUPPORTS_MBUFQ 0x00010000 /* Supports the mbuf queue method */ #define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */ #define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */ #define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */ #define TF2_PROC_SACK_PROHIBIT 0x00100000 /* Due to small MSS size do not process sack's */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 5 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 5 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 4 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 4 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_progdrops; /* drops due to no progress */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ + uint64_t tcps_sack_rexmits_tso; /* SACK rexmit TSO chunks */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_rcvce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_rcvect0; /* ECN Capable Transport */ uint64_t tcps_ecn_rcvect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */ uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */ /* Dsack related stats */ uint64_t tcps_dsack_count; /* Number of ACKs arriving with DSACKs */ uint64_t tcps_dsack_bytes; /* Number of bytes DSACK'ed no TLP */ uint64_t tcps_dsack_tlp_bytes; /* Number of bytes DSACK'ed due to TLPs */ /* TCPS_TIME_WAIT usage stats */ uint64_t tcps_tw_recycles; /* Times time-wait was recycled. */ uint64_t tcps_tw_resets; /* Times time-wait sent a reset. */ uint64_t tcps_tw_responds; /* Times time-wait sent a valid ack. */ /* Accurate ECN Handshake stats */ uint64_t tcps_ace_nect; /* ACE SYN packet with Non-ECT */ uint64_t tcps_ace_ect1; /* ACE SYN packet with ECT1 */ uint64_t tcps_ace_ect0; /* ACE SYN packet with ECT0 */ uint64_t tcps_ace_ce; /* ACE SYN packet with CE */ /* ECN related stats */ uint64_t tcps_ecn_sndect0; /* ECN Capable Transport */ uint64_t tcps_ecn_sndect1; /* ECN Capable Transport */ /* * BBR and Rack implement TLP's these values count TLP bytes in * two catagories, bytes that were retransmitted and bytes that * were newly transmited. Both types can serve as TLP's but they * are accounted differently. */ uint64_t tcps_tlpresends; /* number of tlp resends */ uint64_t tcps_tlpresend_bytes; /* number of bytes resent by tlp */ - uint64_t _pad[4]; /* 4 TBD placeholder for STABLE */ + uint64_t _pad[3]; /* 3 TBD placeholder for STABLE */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ do { \ MIB_SDT_PROBE1(tcp, count, name, (val)); \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)); \ } while (0) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_add(int statnum, int val); #define KMOD_TCPSTAT_ADD(name, val) \ do { \ MIB_SDT_PROBE1(tcp, count, name, (val)); \ kmod_tcpstat_add(offsetof(struct tcpstat, name) / \ sizeof(uint64_t), \ val); \ } while (0) #define KMOD_TCPSTAT_INC(name) KMOD_TCPSTAT_ADD(name, 1) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ char xt_cc[TCP_CA_NAME_MAX]; /* (s) */ int64_t spare64[6]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ uint32_t t_snd_cwnd; /* (s) */ uint32_t t_snd_ssthresh; /* (s) */ uint32_t t_maxseg; /* (s) */ uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ uint32_t t_dsack_bytes; /* (n) */ uint32_t t_dsack_tlp_bytes; /* (n) */ uint32_t t_dsack_pack; /* (n) */ uint16_t xt_encaps_port; /* (s) */ int16_t spare16; int32_t spare32[22]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). */ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. */ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_tolerate_missing_ts); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_newsack); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_retries); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sack_tso); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_tolerate_missing_ts VNET(tcp_tolerate_missing_ts) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_newsack VNET(tcp_do_newsack) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_retries VNET(tcp_retries) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sack_tso VNET(tcp_sack_tso) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif void tcp_account_for_send(struct tcpcb *, uint32_t, uint8_t, uint8_t, bool); int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, const void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_do_segment(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs); uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt); int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt); void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_extra_mbuf; extern counter_u64_t tcp_would_have_but; extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; extern counter_u64_t tcp_bad_csums; #ifdef TCP_SAD_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; extern int32_t tcp_sad_limit; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp6_use_min_mtu(struct tcpcb *); u_int tcp_maxseg(const struct tcpcb *); u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_default_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, uint16_t); bool tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void tcp_timer_activate(struct tcpcb *, tt_which, u_int); bool tcp_timer_active(struct tcpcb *, tt_which); void tcp_timer_stop(struct tcpcb *); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); void cc_after_idle(struct tcpcb *tp); extern struct protosw tcp_protosw; /* shared for TOE */ extern struct protosw tcp6_protosw; /* shared for TOE */ uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); sackstatus_t tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t, u_int *); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *, u_int *); void tcp_resend_sackholes(struct tcpcb *tp); void tcp_free_sackholes(struct tcpcb *tp); void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int tcp_can_enable_pacing(void); int tcp_incr_dgp_pacing_cnt(void); void tcp_dec_dgp_pacing_cnt(void); void tcp_decrement_paced_conn(void); void tcp_change_time_units(struct tcpcb *, int); void tcp_handle_orphaned_packets(struct tcpcb *); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); #ifdef TCP_REQUEST_TRK void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent); struct tcp_sendfile_track * tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip); int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point); int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point); struct tcp_sendfile_track * tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq); void tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes); uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes); void tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts); struct tcp_sendfile_track * tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups); #endif #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss); #endif static inline void tcp_lro_features_off(struct tcpcb *tp) { tp->t_flags2 &= ~(TF2_SUPPORTS_MBUFQ| TF2_MBUF_QUEUE_READY| TF2_DONT_SACK_QUEUE| TF2_MBUF_ACKCMP| TF2_MBUF_L_ACKS); } static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index 0657926eab80..5d7fd0e46cf1 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -1,1533 +1,1535 @@ /*- * Copyright (c) 1983, 1988, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #define _WANT_SOCKET #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netstat.h" #include "nl_defs.h" #define max(a, b) (((a) > (b)) ? (a) : (b)) #ifdef INET static void inetprint(const char *, struct in_addr *, int, const char *, int, const int); #endif #ifdef INET6 static int udp_done, tcp_done, sdp_done; #endif /* INET6 */ static int pcblist_sysctl(int proto, const char *name, char **bufp) { const char *mibvar; char *buf; size_t len; switch (proto) { case IPPROTO_TCP: mibvar = "net.inet.tcp.pcblist"; break; case IPPROTO_UDP: mibvar = "net.inet.udp.pcblist"; break; default: mibvar = "net.inet.raw.pcblist"; break; } if (strncmp(name, "sdp", 3) == 0) mibvar = "net.inet.sdp.pcblist"; else if (strncmp(name, "divert", 6) == 0) mibvar = "net.inet.divert.pcblist"; len = 0; if (sysctlbyname(mibvar, 0, &len, 0, 0) < 0) { if (errno != ENOENT) xo_warn("sysctl: %s", mibvar); return (0); } if ((buf = malloc(len)) == NULL) { xo_warnx("malloc %lu bytes", (u_long)len); return (0); } if (sysctlbyname(mibvar, buf, &len, 0, 0) < 0) { xo_warn("sysctl: %s", mibvar); free(buf); return (0); } *bufp = buf; return (1); } /* * Copied directly from uipc_socket2.c. We leave out some fields that are in * nested structures that aren't used to avoid extra work. */ static void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } int sotoxsocket(struct socket *so, struct xsocket *xso) { struct protosw proto; struct domain domain; bzero(xso, sizeof *xso); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; if (kread((uintptr_t)so->so_proto, &proto, sizeof(proto)) != 0) return (-1); xso->xso_protocol = proto.pr_protocol; if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; if ((so->so_options & SO_ACCEPTCONN) != 0) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; } else { sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_oobmark = so->so_oobmark; } return (0); } /* * Print a summary of connections related to an Internet * protocol. For TCP, also give state of connection. * Listening processes (aflag) are suppressed unless the * -a (all) flag is specified. */ void protopr(u_long off, const char *name, int af1, int proto) { static int first = 1; int istcp; char *buf; const char *vchar; struct xtcpcb *tp; struct xinpcb *inp; struct xinpgen *xig, *oxig; struct xsocket *so; int fnamelen, cnamelen; istcp = 0; switch (proto) { case IPPROTO_TCP: #ifdef INET6 if (strncmp(name, "sdp", 3) != 0) { if (tcp_done != 0) return; else tcp_done = 1; } else { if (sdp_done != 0) return; else sdp_done = 1; } #endif istcp = 1; break; case IPPROTO_UDP: #ifdef INET6 if (udp_done != 0) return; else udp_done = 1; #endif break; } if (!pcblist_sysctl(proto, name, &buf)) return; if (istcp && (cflag || Cflag)) { fnamelen = strlen("Stack"); cnamelen = strlen("CC"); oxig = xig = (struct xinpgen *)buf; for (xig = (struct xinpgen*)((char *)xig + xig->xig_len); xig->xig_len > sizeof(struct xinpgen); xig = (struct xinpgen *)((char *)xig + xig->xig_len)) { tp = (struct xtcpcb *)xig; inp = &tp->xt_inp; if (inp->inp_gencnt > oxig->xig_gen) continue; so = &inp->xi_socket; if (so->xso_protocol != proto) continue; fnamelen = max(fnamelen, (int)strlen(tp->xt_stack)); cnamelen = max(cnamelen, (int)strlen(tp->xt_cc)); } } oxig = xig = (struct xinpgen *)buf; for (xig = (struct xinpgen *)((char *)xig + xig->xig_len); xig->xig_len > sizeof(struct xinpgen); xig = (struct xinpgen *)((char *)xig + xig->xig_len)) { if (istcp) { tp = (struct xtcpcb *)xig; inp = &tp->xt_inp; } else { inp = (struct xinpcb *)xig; } so = &inp->xi_socket; /* Ignore sockets for protocols other than the desired one. */ if (proto != 0 && so->xso_protocol != proto) continue; /* Ignore PCBs which were freed during copyout. */ if (inp->inp_gencnt > oxig->xig_gen) continue; if ((af1 == AF_INET && (inp->inp_vflag & INP_IPV4) == 0) #ifdef INET6 || (af1 == AF_INET6 && (inp->inp_vflag & INP_IPV6) == 0) #endif /* INET6 */ || (af1 == AF_UNSPEC && ((inp->inp_vflag & INP_IPV4) == 0 #ifdef INET6 && (inp->inp_vflag & INP_IPV6) == 0 #endif /* INET6 */ )) ) continue; if (!aflag && ( (istcp && tp->t_state == TCPS_LISTEN) || (af1 == AF_INET && inp->inp_laddr.s_addr == INADDR_ANY) #ifdef INET6 || (af1 == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) #endif /* INET6 */ || (af1 == AF_UNSPEC && (((inp->inp_vflag & INP_IPV4) != 0 && inp->inp_laddr.s_addr == INADDR_ANY) #ifdef INET6 || ((inp->inp_vflag & INP_IPV6) != 0 && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) #endif )) )) continue; if (first) { if (!Lflag) { xo_emit("Active Internet connections"); if (aflag) xo_emit(" (including servers)"); } else xo_emit( "Current listen queue sizes (qlen/incqlen/maxqlen)"); xo_emit("\n"); if (Aflag) xo_emit("{T:/%-*s} ", 2 * (int)sizeof(void *), "Tcpcb"); if (Lflag) xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-32.32s} {T:/%-18.18s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-32.32s} {T:/%-22.22s}" : "{T:/%-5.5s} {T:/%-32.32s} {T:/%-45.45s}"), "Proto", "Listen", "Local Address"); else if (Tflag) xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%s}" : "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%s}"), "Proto", "Rexmit", "OOORcv", "0-win", "Local Address", "Foreign Address"); else { xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%-18.18s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%-22.22s}" : "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%-45.45s}"), "Proto", "Recv-Q", "Send-Q", "Local Address", "Foreign Address"); if (!xflag && !Rflag) xo_emit(" {T:/%-11.11s}", "(state)"); } if (xflag) { xo_emit("{T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s}", "R-HIWA", "S-HIWA", "R-LOWA", "S-LOWA", "R-BCNT", "S-BCNT", "R-BMAX", "S-BMAX"); xo_emit(" {T:/%7.7s} {T:/%7.7s} {T:/%7.7s} " "{T:/%7.7s} {T:/%7.7s} {T:/%7.7s}", "rexmt", "persist", "keep", "2msl", "delack", "rcvtime"); } else if (Rflag) { xo_emit(" {T:/%8.8s} {T:/%5.5s}", "flowid", "ftype"); } if (cflag) { xo_emit(" {T:/%-*.*s}", fnamelen, fnamelen, "Stack"); } if (Cflag) xo_emit(" {T:/%-*.*s} {T:/%10.10s}" " {T:/%10.10s} {T:/%5.5s}" " {T:/%3.3s}", cnamelen, cnamelen, "CC", "cwin", "ssthresh", "MSS", "ECN"); if (Pflag) xo_emit(" {T:/%s}", "Log ID"); xo_emit("\n"); first = 0; } if (Lflag && so->so_qlimit == 0) continue; xo_open_instance("socket"); if (Aflag) xo_emit("{q:address/%*lx} ", 2 * (int)sizeof(void *), (u_long)so->so_pcb); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) vchar = ((inp->inp_vflag & INP_IPV4) != 0) ? "46" : "6"; else #endif vchar = ((inp->inp_vflag & INP_IPV4) != 0) ? "4" : ""; if (istcp && (tp->t_flags & TF_TOE) != 0) xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", "toe", vchar); else xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", name, vchar); if (Lflag) { char buf1[33]; snprintf(buf1, sizeof buf1, "%u/%u/%u", so->so_qlen, so->so_incqlen, so->so_qlimit); xo_emit("{:listen-queue-sizes/%-32.32s} ", buf1); } else if (Tflag) { if (istcp) xo_emit("{:sent-retransmit-packets/%6u} " "{:received-out-of-order-packets/%6u} " "{:sent-zero-window/%6u} ", tp->t_sndrexmitpack, tp->t_rcvoopack, tp->t_sndzerowin); else xo_emit("{P:/%21s}", ""); } else { xo_emit("{:receive-bytes-waiting/%6u} " "{:send-bytes-waiting/%6u} ", so->so_rcv.sb_cc, so->so_snd.sb_cc); } if (numeric_port) { #ifdef INET if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 1, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, 1, af1); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 1); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, 1); } /* else nothing printed now */ #endif /* INET6 */ } else if (inp->inp_flags & INP_ANONPORT) { #ifdef INET if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 1, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, 0, af1); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 1); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, 0); } /* else nothing printed now */ #endif /* INET6 */ } else { #ifdef INET if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 0, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, inp->inp_lport != inp->inp_fport, af1); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 0); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, inp->inp_lport != inp->inp_fport); } /* else nothing printed now */ #endif /* INET6 */ } if (xflag) { xo_emit("{:receive-high-water/%6u} " "{:send-high-water/%6u} " "{:receive-low-water/%6u} {:send-low-water/%6u} " "{:receive-mbuf-bytes/%6u} {:send-mbuf-bytes/%6u} " "{:receive-mbuf-bytes-max/%6u} " "{:send-mbuf-bytes-max/%6u}", so->so_rcv.sb_hiwat, so->so_snd.sb_hiwat, so->so_rcv.sb_lowat, so->so_snd.sb_lowat, so->so_rcv.sb_mbcnt, so->so_snd.sb_mbcnt, so->so_rcv.sb_mbmax, so->so_snd.sb_mbmax); if (istcp) xo_emit(" {:retransmit-timer/%4d.%02d} " "{:persist-timer/%4d.%02d} " "{:keepalive-timer/%4d.%02d} " "{:msl2-timer/%4d.%02d} " "{:delay-ack-timer/%4d.%02d} " "{:inactivity-timer/%4d.%02d}", tp->tt_rexmt / 1000, (tp->tt_rexmt % 1000) / 10, tp->tt_persist / 1000, (tp->tt_persist % 1000) / 10, tp->tt_keep / 1000, (tp->tt_keep % 1000) / 10, tp->tt_2msl / 1000, (tp->tt_2msl % 1000) / 10, tp->tt_delack / 1000, (tp->tt_delack % 1000) / 10, tp->t_rcvtime / 1000, (tp->t_rcvtime % 1000) / 10); } if (istcp && !Lflag && !xflag && !Tflag && !Rflag) { if (tp->t_state < 0 || tp->t_state >= TCP_NSTATES) xo_emit("{:tcp-state/%-11d}", tp->t_state); else { xo_emit("{:tcp-state/%-11s}", tcpstates[tp->t_state]); #if defined(TF_NEEDSYN) && defined(TF_NEEDFIN) /* Show T/TCP `hidden state' */ if (tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) xo_emit("{:need-syn-or-fin/*}"); #endif /* defined(TF_NEEDSYN) && defined(TF_NEEDFIN) */ } } if (Rflag) { /* XXX: is this right Alfred */ xo_emit(" {:flow-id/%08x} {:flow-type/%5d}", inp->inp_flowid, inp->inp_flowtype); } if (istcp) { if (cflag) xo_emit(" {:stack/%-*.*s}", fnamelen, fnamelen, tp->xt_stack); if (Cflag) xo_emit(" {:cc/%-*.*s}" " {:snd-cwnd/%10lu}" " {:snd-ssthresh/%10lu}" " {:t-maxseg/%5u} {:ecn/%3s}", cnamelen, cnamelen, tp->xt_cc, tp->t_snd_cwnd, tp->t_snd_ssthresh, tp->t_maxseg, (tp->t_state >= TCPS_ESTABLISHED ? (tp->xt_ecn > 0 ? (tp->xt_ecn == 1 ? "ecn" : "ace") : "off") : "n/a")); if (Pflag) xo_emit(" {:log-id/%s}", tp->xt_logid[0] == '\0' ? "-" : tp->xt_logid); } xo_emit("\n"); xo_close_instance("socket"); } if (xig != oxig && xig->xig_gen != oxig->xig_gen) { if (oxig->xig_count > xig->xig_count) { xo_emit("Some {d:lost/%s} sockets may have been " "deleted.\n", name); } else if (oxig->xig_count < xig->xig_count) { xo_emit("Some {d:created/%s} sockets may have been " "created.\n", name); } else { xo_emit("Some {d:changed/%s} sockets may have been " "created or deleted.\n", name); } } free(buf); } /* * Dump TCP statistics structure. */ void tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct tcpstat tcpstat; uint64_t tcps_states[TCP_NSTATES]; #ifdef INET6 if (tcp_done != 0) return; else tcp_done = 1; #endif if (fetch_stats("net.inet.tcp.stats", off, &tcpstat, sizeof(tcpstat), kread_counters) != 0) return; if (fetch_stats_ro("net.inet.tcp.states", nl[N_TCPS_STATES].n_value, &tcps_states, sizeof(tcps_states), kread_counters) != 0) return; xo_open_container("tcp"); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f, plural(tcpstat.f)) #define p1a(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f) #define p2(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1), \ (uintmax_t )tcpstat.f2, plural(tcpstat.f2)) #define p2a(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1), \ (uintmax_t )tcpstat.f2) #define p3(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f, pluralies(tcpstat.f)) p(tcps_sndtotal, "\t{:sent-packets/%ju} {N:/packet%s sent}\n"); p2(tcps_sndpack,tcps_sndbyte, "\t\t{:sent-data-packets/%ju} " "{N:/data packet%s} ({:sent-data-bytes/%ju} {N:/byte%s})\n"); p2(tcps_sndrexmitpack, tcps_sndrexmitbyte, "\t\t" "{:sent-retransmitted-packets/%ju} {N:/data packet%s} " "({:sent-retransmitted-bytes/%ju} {N:/byte%s}) " "{N:retransmitted}\n"); p(tcps_sndrexmitbad, "\t\t" "{:sent-unnecessary-retransmitted-packets/%ju} " "{N:/data packet%s unnecessarily retransmitted}\n"); p(tcps_mturesent, "\t\t{:sent-resends-by-mtu-discovery/%ju} " "{N:/resend%s initiated by MTU discovery}\n"); p2a(tcps_sndacks, tcps_delack, "\t\t{:sent-ack-only-packets/%ju} " "{N:/ack-only packet%s/} ({:sent-packets-delayed/%ju} " "{N:delayed})\n"); p(tcps_sndurg, "\t\t{:sent-urg-only-packets/%ju} " "{N:/URG only packet%s}\n"); p(tcps_sndprobe, "\t\t{:sent-window-probe-packets/%ju} " "{N:/window probe packet%s}\n"); p(tcps_sndwinup, "\t\t{:sent-window-update-packets/%ju} " "{N:/window update packet%s}\n"); p(tcps_sndctrl, "\t\t{:sent-control-packets/%ju} " "{N:/control packet%s}\n"); p(tcps_rcvtotal, "\t{:received-packets/%ju} " "{N:/packet%s received}\n"); p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t" "{:received-ack-packets/%ju} {N:/ack%s} " "{N:(for} {:received-ack-bytes/%ju} {N:/byte%s})\n"); p(tcps_rcvdupack, "\t\t{:received-duplicate-acks/%ju} " "{N:/duplicate ack%s}\n"); p(tcps_tunneled_pkts, "\t\t{:received-udp-tunneled-pkts/%ju} " "{N:/UDP tunneled pkt%s}\n"); p(tcps_tunneled_errs, "\t\t{:received-bad-udp-tunneled-pkts/%ju} " "{N:/UDP tunneled pkt cnt with error%s}\n"); p(tcps_rcvacktoomuch, "\t\t{:received-acks-for-unsent-data/%ju} " "{N:/ack%s for unsent data}\n"); p2(tcps_rcvpack, tcps_rcvbyte, "\t\t" "{:received-in-sequence-packets/%ju} {N:/packet%s} " "({:received-in-sequence-bytes/%ju} {N:/byte%s}) " "{N:received in-sequence}\n"); p2(tcps_rcvduppack, tcps_rcvdupbyte, "\t\t" "{:received-completely-duplicate-packets/%ju} " "{N:/completely duplicate packet%s} " "({:received-completely-duplicate-bytes/%ju} {N:/byte%s})\n"); p(tcps_pawsdrop, "\t\t{:received-old-duplicate-packets/%ju} " "{N:/old duplicate packet%s}\n"); p2(tcps_rcvpartduppack, tcps_rcvpartdupbyte, "\t\t" "{:received-some-duplicate-packets/%ju} " "{N:/packet%s with some dup. data} " "({:received-some-duplicate-bytes/%ju} {N:/byte%s duped/})\n"); p2(tcps_rcvoopack, tcps_rcvoobyte, "\t\t{:received-out-of-order/%ju} " "{N:/out-of-order packet%s} " "({:received-out-of-order-bytes/%ju} {N:/byte%s})\n"); p2(tcps_rcvpackafterwin, tcps_rcvbyteafterwin, "\t\t" "{:received-after-window-packets/%ju} {N:/packet%s} " "({:received-after-window-bytes/%ju} {N:/byte%s}) " "{N:of data after window}\n"); p(tcps_rcvwinprobe, "\t\t{:received-window-probes/%ju} " "{N:/window probe%s}\n"); p(tcps_rcvwinupd, "\t\t{:receive-window-update-packets/%ju} " "{N:/window update packet%s}\n"); p(tcps_dsack_count, "\t\t{:received-with-dsack-packets/%ju} " "{N:/packet%s received with dsack}\n"); p(tcps_dsack_bytes, "\t\t{:received-with-dsack-bytes/%ju} " "{N:/dsack byte%s received (no TLP involved)}\n"); p(tcps_dsack_tlp_bytes, "\t\t{:received-with-dsack-bytes-tlp/%ju} " "{N:/dsack byte%s received (TLP responsible)}\n"); p(tcps_rcvafterclose, "\t\t{:received-after-close-packets/%ju} " "{N:/packet%s received after close}\n"); p(tcps_rcvbadsum, "\t\t{:discard-bad-checksum/%ju} " "{N:/discarded for bad checksum%s}\n"); p(tcps_rcvbadoff, "\t\t{:discard-bad-header-offset/%ju} " "{N:/discarded for bad header offset field%s}\n"); p1a(tcps_rcvshort, "\t\t{:discard-too-short/%ju} " "{N:discarded because packet too short}\n"); p1a(tcps_rcvreassfull, "\t\t{:discard-reassembly-queue-full/%ju} " "{N:discarded due to full reassembly queue}\n"); p(tcps_connattempt, "\t{:connection-requests/%ju} " "{N:/connection request%s}\n"); p(tcps_accepts, "\t{:connections-accepts/%ju} " "{N:/connection accept%s}\n"); p(tcps_badsyn, "\t{:bad-connection-attempts/%ju} " "{N:/bad connection attempt%s}\n"); p(tcps_listendrop, "\t{:listen-queue-overflows/%ju} " "{N:/listen queue overflow%s}\n"); p(tcps_badrst, "\t{:ignored-in-window-resets/%ju} " "{N:/ignored RSTs in the window%s}\n"); p(tcps_connects, "\t{:connections-established/%ju} " "{N:/connection%s established (including accepts)}\n"); p(tcps_usedrtt, "\t\t{:connections-hostcache-rtt/%ju} " "{N:/time%s used RTT from hostcache}\n"); p(tcps_usedrttvar, "\t\t{:connections-hostcache-rttvar/%ju} " "{N:/time%s used RTT variance from hostcache}\n"); p(tcps_usedssthresh, "\t\t{:connections-hostcache-ssthresh/%ju} " "{N:/time%s used slow-start threshold from hostcache}\n"); p2(tcps_closed, tcps_drops, "\t{:connections-closed/%ju} " "{N:/connection%s closed (including} " "{:connection-drops/%ju} {N:/drop%s})\n"); p(tcps_cachedrtt, "\t\t{:connections-updated-rtt-on-close/%ju} " "{N:/connection%s updated cached RTT on close}\n"); p(tcps_cachedrttvar, "\t\t" "{:connections-updated-variance-on-close/%ju} " "{N:/connection%s updated cached RTT variance on close}\n"); p(tcps_cachedssthresh, "\t\t" "{:connections-updated-ssthresh-on-close/%ju} " "{N:/connection%s updated cached ssthresh on close}\n"); p(tcps_conndrops, "\t{:embryonic-connections-dropped/%ju} " "{N:/embryonic connection%s dropped}\n"); p2(tcps_rttupdated, tcps_segstimed, "\t{:segments-updated-rtt/%ju} " "{N:/segment%s updated rtt (of} " "{:segment-update-attempts/%ju} {N:/attempt%s})\n"); p(tcps_rexmttimeo, "\t{:retransmit-timeouts/%ju} " "{N:/retransmit timeout%s}\n"); p(tcps_timeoutdrop, "\t\t" "{:connections-dropped-by-retransmit-timeout/%ju} " "{N:/connection%s dropped by rexmit timeout}\n"); p(tcps_persisttimeo, "\t{:persist-timeout/%ju} " "{N:/persist timeout%s}\n"); p(tcps_persistdrop, "\t\t" "{:connections-dropped-by-persist-timeout/%ju} " "{N:/connection%s dropped by persist timeout}\n"); p(tcps_finwait2_drops, "\t" "{:connections-dropped-by-finwait2-timeout/%ju} " "{N:/Connection%s (fin_wait_2) dropped because of timeout}\n"); p(tcps_keeptimeo, "\t{:keepalive-timeout/%ju} " "{N:/keepalive timeout%s}\n"); p(tcps_keepprobe, "\t\t{:keepalive-probes/%ju} " "{N:/keepalive probe%s sent}\n"); p(tcps_keepdrops, "\t\t{:connections-dropped-by-keepalives/%ju} " "{N:/connection%s dropped by keepalive}\n"); p(tcps_progdrops, "\t{:connections-dropped-due-to-progress-time/%ju} " "{N:/connection%s dropped due to exceeding progress time}\n"); p(tcps_predack, "\t{:ack-header-predictions/%ju} " "{N:/correct ACK header prediction%s}\n"); p(tcps_preddat, "\t{:data-packet-header-predictions/%ju} " "{N:/correct data packet header prediction%s}\n"); xo_open_container("syncache"); p3(tcps_sc_added, "\t{:entries-added/%ju} " "{N:/syncache entr%s added}\n"); p1a(tcps_sc_retransmitted, "\t\t{:retransmitted/%ju} " "{N:/retransmitted}\n"); p1a(tcps_sc_dupsyn, "\t\t{:duplicates/%ju} {N:/dupsyn}\n"); p1a(tcps_sc_dropped, "\t\t{:dropped/%ju} {N:/dropped}\n"); p1a(tcps_sc_completed, "\t\t{:completed/%ju} {N:/completed}\n"); p1a(tcps_sc_bucketoverflow, "\t\t{:bucket-overflow/%ju} " "{N:/bucket overflow}\n"); p1a(tcps_sc_cacheoverflow, "\t\t{:cache-overflow/%ju} " "{N:/cache overflow}\n"); p1a(tcps_sc_reset, "\t\t{:reset/%ju} {N:/reset}\n"); p1a(tcps_sc_stale, "\t\t{:stale/%ju} {N:/stale}\n"); p1a(tcps_sc_aborted, "\t\t{:aborted/%ju} {N:/aborted}\n"); p1a(tcps_sc_badack, "\t\t{:bad-ack/%ju} {N:/badack}\n"); p1a(tcps_sc_unreach, "\t\t{:unreachable/%ju} {N:/unreach}\n"); p(tcps_sc_zonefail, "\t\t{:zone-failures/%ju} {N:/zone failure%s}\n"); p(tcps_sc_sendcookie, "\t{:sent-cookies/%ju} {N:/cookie%s sent}\n"); p(tcps_sc_recvcookie, "\t{:receivd-cookies/%ju} " "{N:/cookie%s received}\n"); xo_close_container("syncache"); xo_open_container("hostcache"); p3(tcps_hc_added, "\t{:entries-added/%ju} " "{N:/hostcache entr%s added}\n"); p1a(tcps_hc_bucketoverflow, "\t\t{:buffer-overflows/%ju} " "{N:/bucket overflow}\n"); xo_close_container("hostcache"); xo_open_container("sack"); p(tcps_sack_recovery_episode, "\t{:recovery-episodes/%ju} " "{N:/SACK recovery episode%s}\n"); - p(tcps_sack_rexmits, "\t{:segment-retransmits/%ju} " + p(tcps_sack_rexmits, "\t{:segment-retransmits/%ju} " "{N:/segment rexmit%s in SACK recovery episodes}\n"); - p(tcps_sack_rexmit_bytes, "\t{:byte-retransmits/%ju} " + p(tcps_sack_rexmits_tso, "\t{:tso-chunk-retransmits/%ju} " + "{N:/tso chunk rexmit%s in SACK recovery episodes}\n"); + p(tcps_sack_rexmit_bytes, "\t{:byte-retransmits/%ju} " "{N:/byte rexmit%s in SACK recovery episodes}\n"); - p(tcps_sack_rcv_blocks, "\t{:received-blocks/%ju} " + p(tcps_sack_rcv_blocks, "\t{:received-blocks/%ju} " "{N:/SACK option%s (SACK blocks) received}\n"); p(tcps_sack_send_blocks, "\t{:sent-option-blocks/%ju} " "{N:/SACK option%s (SACK blocks) sent}\n"); p(tcps_sack_lostrexmt, "\t{:lost-retransmissions/%ju} " "{N:/SACK retransmission%s lost}\n"); p1a(tcps_sack_sboverflow, "\t{:scoreboard-overflows/%ju} " "{N:/SACK scoreboard overflow}\n"); xo_close_container("sack"); xo_open_container("ecn"); p(tcps_ecn_rcvce, "\t{:received-ce-packets/%ju} " "{N:/packet%s received with ECN CE bit set}\n"); p(tcps_ecn_rcvect0, "\t{:received-ect0-packets/%ju} " "{N:/packet%s received with ECN ECT(0) bit set}\n"); p(tcps_ecn_rcvect1, "\t{:received-ect1-packets/%ju} " "{N:/packet%s received with ECN ECT(1) bit set}\n"); p(tcps_ecn_sndect0, "\t{:sent-ect0-packets/%ju} " "{N:/packet%s sent with ECN ECT(0) bit set}\n"); p(tcps_ecn_sndect1, "\t{:sent-ect1-packets/%ju} " "{N:/packet%s sent with ECN ECT(1) bit set}\n"); p(tcps_ecn_shs, "\t{:handshakes/%ju} " "{N:/successful ECN handshake%s}\n"); p(tcps_ecn_rcwnd, "\t{:congestion-reductions/%ju} " "{N:/time%s ECN reduced the congestion window}\n"); p(tcps_ace_nect, "\t{:ace-nonect-syn/%ju} " "{N:/ACE SYN packet%s with Non-ECT}\n"); p(tcps_ace_ect0, "\t{:ace-ect0-syn/%ju} " "{N:/ACE SYN packet%s with ECT0}\n"); p(tcps_ace_ect1, "\t{:ace-ect1-syn/%ju} " "{N:/ACE SYN packet%s with ECT1}\n"); p(tcps_ace_ce, "\t{:ace-ce-syn/%ju} " "{N:/ACE SYN packet%s with CE}\n"); xo_close_container("ecn"); xo_open_container("tcp-signature"); p(tcps_sig_rcvgoodsig, "\t{:received-good-signature/%ju} " "{N:/packet%s with matching signature received}\n"); p(tcps_sig_rcvbadsig, "\t{:received-bad-signature/%ju} " "{N:/packet%s with bad signature received}\n"); p(tcps_sig_err_buildsig, "\t{:failed-make-signature/%ju} " "{N:/time%s failed to make signature due to no SA}\n"); p(tcps_sig_err_sigopt, "\t{:no-signature-expected/%ju} " "{N:/time%s unexpected signature received}\n"); p(tcps_sig_err_nosigopt, "\t{:no-signature-provided/%ju} " "{N:/time%s no signature provided by segment}\n"); xo_close_container("tcp-signature"); xo_open_container("pmtud"); p(tcps_pmtud_blackhole_activated, "\t{:pmtud-activated/%ju} " "{N:/Path MTU discovery black hole detection activation%s}\n"); p(tcps_pmtud_blackhole_activated_min_mss, "\t{:pmtud-activated-min-mss/%ju} " "{N:/Path MTU discovery black hole detection min MSS activation%s}\n"); p(tcps_pmtud_blackhole_failed, "\t{:pmtud-failed/%ju} " "{N:/Path MTU discovery black hole detection failure%s}\n"); xo_close_container("pmtud"); xo_open_container("tw"); p(tcps_tw_responds, "\t{:tw_responds/%ju} " "{N:/time%s connection in TIME-WAIT responded with ACK}\n"); p(tcps_tw_recycles, "\t{:tw_recycles/%ju} " "{N:/time%s connection in TIME-WAIT was actively recycled}\n"); p(tcps_tw_resets, "\t{:tw_resets/%ju} " "{N:/time%s connection in TIME-WAIT responded with RST}\n"); xo_close_container("tw"); #undef p #undef p1a #undef p2 #undef p2a #undef p3 xo_open_container("TCP connection count by state"); xo_emit("{T:/TCP connection count by state}:\n"); for (int i = 0; i < TCP_NSTATES; i++) { /* * XXXGL: is there a way in libxo to use %s * in the "content string" of a format * string? I failed to do that, that's why * a temporary buffer is used to construct * format string for xo_emit(). */ char fmtbuf[80]; if (sflag > 1 && tcps_states[i] == 0) continue; snprintf(fmtbuf, sizeof(fmtbuf), "\t{:%s/%%ju} " "{Np:/connection ,connections} in %s state\n", tcpstates[i], tcpstates[i]); xo_emit(fmtbuf, (uintmax_t )tcps_states[i]); } xo_close_container("TCP connection count by state"); xo_close_container("tcp"); } /* * Dump UDP statistics structure. */ void udp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct udpstat udpstat; uint64_t delivered; #ifdef INET6 if (udp_done != 0) return; else udp_done = 1; #endif if (fetch_stats("net.inet.udp.stats", off, &udpstat, sizeof(udpstat), kread_counters) != 0) return; xo_open_container("udp"); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (udpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)udpstat.f, plural(udpstat.f)) #define p1a(f, m) if (udpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)udpstat.f) p(udps_ipackets, "{:received-datagrams/%ju} " "{N:/datagram%s received}\n"); p1a(udps_hdrops, "{:dropped-incomplete-headers/%ju} " "{N:/with incomplete header}\n"); p1a(udps_badlen, "{:dropped-bad-data-length/%ju} " "{N:/with bad data length field}\n"); p1a(udps_badsum, "{:dropped-bad-checksum/%ju} " "{N:/with bad checksum}\n"); p1a(udps_nosum, "{:dropped-no-checksum/%ju} " "{N:/with no checksum}\n"); p1a(udps_noport, "{:dropped-no-socket/%ju} " "{N:/dropped due to no socket}\n"); p(udps_noportbcast, "{:dropped-broadcast-multicast/%ju} " "{N:/broadcast\\/multicast datagram%s undelivered}\n"); p1a(udps_fullsock, "{:dropped-full-socket-buffer/%ju} " "{N:/dropped due to full socket buffers}\n"); p1a(udpps_pcbhashmiss, "{:not-for-hashed-pcb/%ju} " "{N:/not for hashed pcb}\n"); delivered = udpstat.udps_ipackets - udpstat.udps_hdrops - udpstat.udps_badlen - udpstat.udps_badsum - udpstat.udps_noport - udpstat.udps_noportbcast - udpstat.udps_fullsock; if (delivered || sflag <= 1) xo_emit("\t{:delivered-packets/%ju} {N:/delivered}\n", (uint64_t)delivered); p(udps_opackets, "{:output-packets/%ju} {N:/datagram%s output}\n"); /* the next statistic is cumulative in udps_noportbcast */ p(udps_filtermcast, "{:multicast-source-filter-matches/%ju} " "{N:/time%s multicast source filter matched}\n"); #undef p #undef p1a xo_close_container("udp"); } /* * Dump CARP statistics structure. */ void carp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct carpstats carpstat; if (fetch_stats("net.inet.carp.stats", off, &carpstat, sizeof(carpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (carpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)carpstat.f, plural(carpstat.f)) #define p2(f, m) if (carpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)carpstat.f) p(carps_ipackets, "\t{:received-inet-packets/%ju} " "{N:/packet%s received (IPv4)}\n"); p(carps_ipackets6, "\t{:received-inet6-packets/%ju} " "{N:/packet%s received (IPv6)}\n"); p(carps_badttl, "\t\t{:dropped-wrong-ttl/%ju} " "{N:/packet%s discarded for wrong TTL}\n"); p(carps_hdrops, "\t\t{:dropped-short-header/%ju} " "{N:/packet%s shorter than header}\n"); p(carps_badsum, "\t\t{:dropped-bad-checksum/%ju} " "{N:/discarded for bad checksum%s}\n"); p(carps_badver, "\t\t{:dropped-bad-version/%ju} " "{N:/discarded packet%s with a bad version}\n"); p2(carps_badlen, "\t\t{:dropped-short-packet/%ju} " "{N:/discarded because packet too short}\n"); p2(carps_badauth, "\t\t{:dropped-bad-authentication/%ju} " "{N:/discarded for bad authentication}\n"); p2(carps_badvhid, "\t\t{:dropped-bad-vhid/%ju} " "{N:/discarded for bad vhid}\n"); p2(carps_badaddrs, "\t\t{:dropped-bad-address-list/%ju} " "{N:/discarded because of a bad address list}\n"); p(carps_opackets, "\t{:sent-inet-packets/%ju} " "{N:/packet%s sent (IPv4)}\n"); p(carps_opackets6, "\t{:sent-inet6-packets/%ju} " "{N:/packet%s sent (IPv6)}\n"); p2(carps_onomem, "\t\t{:send-failed-memory-error/%ju} " "{N:/send failed due to mbuf memory error}\n"); #if notyet p(carps_ostates, "\t\t{:send-state-updates/%s} " "{N:/state update%s sent}\n"); #endif #undef p #undef p2 xo_close_container(name); } /* * Dump IP statistics structure. */ void ip_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct ipstat ipstat; if (fetch_stats("net.inet.ip.stats", off, &ipstat, sizeof(ipstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (ipstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )ipstat.f, plural(ipstat.f)) #define p1a(f, m) if (ipstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )ipstat.f) p(ips_total, "\t{:received-packets/%ju} " "{N:/total packet%s received}\n"); p(ips_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/bad header checksum%s}\n"); p1a(ips_toosmall, "\t{:dropped-below-minimum-size/%ju} " "{N:/with size smaller than minimum}\n"); p1a(ips_tooshort, "\t{:dropped-short-packets/%ju} " "{N:/with data size < data length}\n"); p1a(ips_toolong, "\t{:dropped-too-long/%ju} " "{N:/with ip length > max ip packet size}\n"); p1a(ips_badhlen, "\t{:dropped-short-header-length/%ju} " "{N:/with header length < data size}\n"); p1a(ips_badlen, "\t{:dropped-short-data/%ju} " "{N:/with data length < header length}\n"); p1a(ips_badoptions, "\t{:dropped-bad-options/%ju} " "{N:/with bad options}\n"); p1a(ips_badvers, "\t{:dropped-bad-version/%ju} " "{N:/with incorrect version number}\n"); p(ips_fragments, "\t{:received-fragments/%ju} " "{N:/fragment%s received}\n"); p(ips_fragdropped, "\t{:dropped-fragments/%ju} " "{N:/fragment%s dropped (dup or out of space)}\n"); p(ips_fragtimeout, "\t{:dropped-fragments-after-timeout/%ju} " "{N:/fragment%s dropped after timeout}\n"); p(ips_reassembled, "\t{:reassembled-packets/%ju} " "{N:/packet%s reassembled ok}\n"); p(ips_delivered, "\t{:received-local-packets/%ju} " "{N:/packet%s for this host}\n"); p(ips_noproto, "\t{:dropped-unknown-protocol/%ju} " "{N:/packet%s for unknown\\/unsupported protocol}\n"); p(ips_forward, "\t{:forwarded-packets/%ju} " "{N:/packet%s forwarded}"); p(ips_fastforward, " ({:fast-forwarded-packets/%ju} " "{N:/packet%s fast forwarded})"); if (ipstat.ips_forward || sflag <= 1) xo_emit("\n"); p(ips_cantforward, "\t{:packets-cannot-forward/%ju} " "{N:/packet%s not forwardable}\n"); p(ips_notmember, "\t{:received-unknown-multicast-group/%ju} " "{N:/packet%s received for unknown multicast group}\n"); p(ips_redirectsent, "\t{:redirects-sent/%ju} " "{N:/redirect%s sent}\n"); p(ips_localout, "\t{:sent-packets/%ju} " "{N:/packet%s sent from this host}\n"); p(ips_rawout, "\t{:send-packets-fabricated-header/%ju} " "{N:/packet%s sent with fabricated ip header}\n"); p(ips_odropped, "\t{:discard-no-mbufs/%ju} " "{N:/output packet%s dropped due to no bufs, etc.}\n"); p(ips_noroute, "\t{:discard-no-route/%ju} " "{N:/output packet%s discarded due to no route}\n"); p(ips_fragmented, "\t{:sent-fragments/%ju} " "{N:/output datagram%s fragmented}\n"); p(ips_ofragments, "\t{:fragments-created/%ju} " "{N:/fragment%s created}\n"); p(ips_cantfrag, "\t{:discard-cannot-fragment/%ju} " "{N:/datagram%s that can't be fragmented}\n"); p(ips_nogif, "\t{:discard-tunnel-no-gif/%ju} " "{N:/tunneling packet%s that can't find gif}\n"); p(ips_badaddr, "\t{:discard-bad-address/%ju} " "{N:/datagram%s with bad address in header}\n"); #undef p #undef p1a xo_close_container(name); } /* * Dump ARP statistics structure. */ void arp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct arpstat arpstat; if (fetch_stats("net.link.ether.arp.stats", off, &arpstat, sizeof(arpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (arpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)arpstat.f, plural(arpstat.f)) #define p2(f, m) if (arpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)arpstat.f, pluralies(arpstat.f)) p(txrequests, "{:sent-requests/%ju} {N:/ARP request%s sent}\n"); p(txerrors, "{:sent-failures/%ju} {N:/ARP request%s failed to sent}\n"); p2(txreplies, "{:sent-replies/%ju} {N:/ARP repl%s sent}\n"); p(rxrequests, "{:received-requests/%ju} " "{N:/ARP request%s received}\n"); p2(rxreplies, "{:received-replies/%ju} " "{N:/ARP repl%s received}\n"); p(received, "{:received-packets/%ju} " "{N:/ARP packet%s received}\n"); p(dropped, "{:dropped-no-entry/%ju} " "{N:/total packet%s dropped due to no ARP entry}\n"); p(timeouts, "{:entries-timeout/%ju} " "{N:/ARP entry%s timed out}\n"); p(dupips, "{:dropped-duplicate-address/%ju} " "{N:/Duplicate IP%s seen}\n"); #undef p #undef p2 xo_close_container(name); } static const char *icmpnames[ICMP_MAXTYPE + 1] = { "echo reply", /* RFC 792 */ "#1", "#2", "destination unreachable", /* RFC 792 */ "source quench", /* RFC 792 */ "routing redirect", /* RFC 792 */ "#6", "#7", "echo", /* RFC 792 */ "router advertisement", /* RFC 1256 */ "router solicitation", /* RFC 1256 */ "time exceeded", /* RFC 792 */ "parameter problem", /* RFC 792 */ "time stamp", /* RFC 792 */ "time stamp reply", /* RFC 792 */ "information request", /* RFC 792 */ "information request reply", /* RFC 792 */ "address mask request", /* RFC 950 */ "address mask reply", /* RFC 950 */ "#19", "#20", "#21", "#22", "#23", "#24", "#25", "#26", "#27", "#28", "#29", "icmp traceroute", /* RFC 1393 */ "datagram conversion error", /* RFC 1475 */ "mobile host redirect", "IPv6 where-are-you", "IPv6 i-am-here", "mobile registration req", "mobile registration reply", "domain name request", /* RFC 1788 */ "domain name reply", /* RFC 1788 */ "icmp SKIP", "icmp photuris", /* RFC 2521 */ }; /* * Dump ICMP statistics. */ void icmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct icmpstat icmpstat; size_t len; int i, first; if (fetch_stats("net.inet.icmp.stats", off, &icmpstat, sizeof(icmpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f, plural(icmpstat.f)) #define p1a(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f) #define p2(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f, plurales(icmpstat.f)) p(icps_error, "\t{:icmp-calls/%lu} " "{N:/call%s to icmp_error}\n"); p(icps_oldicmp, "\t{:errors-not-from-message/%lu} " "{N:/error%s not generated in response to an icmp message}\n"); for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) { if (icmpstat.icps_outhist[i] != 0) { if (first) { xo_open_list("output-histogram"); xo_emit("\tOutput histogram:\n"); first = 0; } xo_open_instance("output-histogram"); if (icmpnames[i] != NULL) xo_emit("\t\t{k:name/%s}: {:count/%lu}\n", icmpnames[i], icmpstat.icps_outhist[i]); else xo_emit("\t\tunknown ICMP #{k:name/%d}: " "{:count/%lu}\n", i, icmpstat.icps_outhist[i]); xo_close_instance("output-histogram"); } } if (!first) xo_close_list("output-histogram"); p(icps_badcode, "\t{:dropped-bad-code/%lu} " "{N:/message%s with bad code fields}\n"); p(icps_tooshort, "\t{:dropped-too-short/%lu} " "{N:/message%s less than the minimum length}\n"); p(icps_checksum, "\t{:dropped-bad-checksum/%lu} " "{N:/message%s with bad checksum}\n"); p(icps_badlen, "\t{:dropped-bad-length/%lu} " "{N:/message%s with bad length}\n"); p1a(icps_bmcastecho, "\t{:dropped-multicast-echo/%lu} " "{N:/multicast echo requests ignored}\n"); p1a(icps_bmcasttstamp, "\t{:dropped-multicast-timestamp/%lu} " "{N:/multicast timestamp requests ignored}\n"); for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) { if (icmpstat.icps_inhist[i] != 0) { if (first) { xo_open_list("input-histogram"); xo_emit("\tInput histogram:\n"); first = 0; } xo_open_instance("input-histogram"); if (icmpnames[i] != NULL) xo_emit("\t\t{k:name/%s}: {:count/%lu}\n", icmpnames[i], icmpstat.icps_inhist[i]); else xo_emit( "\t\tunknown ICMP #{k:name/%d}: {:count/%lu}\n", i, icmpstat.icps_inhist[i]); xo_close_instance("input-histogram"); } } if (!first) xo_close_list("input-histogram"); p(icps_reflect, "\t{:sent-packets/%lu} " "{N:/message response%s generated}\n"); p2(icps_badaddr, "\t{:discard-invalid-return-address/%lu} " "{N:/invalid return address%s}\n"); p(icps_noroute, "\t{:discard-no-route/%lu} " "{N:/no return route%s}\n"); #undef p #undef p1a #undef p2 if (live) { len = sizeof i; if (sysctlbyname("net.inet.icmp.maskrepl", &i, &len, NULL, 0) < 0) return; xo_emit("\tICMP address mask responses are " "{q:icmp-address-responses/%sabled}\n", i ? "en" : "dis"); } xo_close_container(name); } /* * Dump IGMP statistics structure. */ void igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct igmpstat igmpstat; int error, zflag0; if (fetch_stats("net.inet.igmp.stats", 0, &igmpstat, sizeof(igmpstat), kread) != 0) return; /* * Reread net.inet.igmp.stats when zflag == 1. * This is because this MIB contains version number and * length of the structure which are not set when clearing * the counters. */ zflag0 = zflag; if (zflag) { zflag = 0; error = fetch_stats("net.inet.igmp.stats", 0, &igmpstat, sizeof(igmpstat), kread); zflag = zflag0; if (error) return; } if (igmpstat.igps_version != IGPS_VERSION_3) { xo_warnx("%s: version mismatch (%d != %d)", __func__, igmpstat.igps_version, IGPS_VERSION_3); return; } if (igmpstat.igps_len != IGPS_VERSION3_LEN) { xo_warnx("%s: size mismatch (%d != %d)", __func__, igmpstat.igps_len, IGPS_VERSION3_LEN); return; } xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p64(f, m) if (igmpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t) igmpstat.f, plural(igmpstat.f)) #define py64(f, m) if (igmpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t) igmpstat.f, pluralies(igmpstat.f)) p64(igps_rcv_total, "\t{:received-messages/%ju} " "{N:/message%s received}\n"); p64(igps_rcv_tooshort, "\t{:dropped-too-short/%ju} " "{N:/message%s received with too few bytes}\n"); p64(igps_rcv_badttl, "\t{:dropped-wrong-ttl/%ju} " "{N:/message%s received with wrong TTL}\n"); p64(igps_rcv_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/message%s received with bad checksum}\n"); py64(igps_rcv_v1v2_queries, "\t{:received-membership-queries/%ju} " "{N:/V1\\/V2 membership quer%s received}\n"); py64(igps_rcv_v3_queries, "\t{:received-v3-membership-queries/%ju} " "{N:/V3 membership quer%s received}\n"); py64(igps_rcv_badqueries, "\t{:dropped-membership-queries/%ju} " "{N:/membership quer%s received with invalid field(s)}\n"); py64(igps_rcv_gen_queries, "\t{:received-general-queries/%ju} " "{N:/general quer%s received}\n"); py64(igps_rcv_group_queries, "\t{:received-group-queries/%ju} " "{N:/group quer%s received}\n"); py64(igps_rcv_gsr_queries, "\t{:received-group-source-queries/%ju} " "{N:/group-source quer%s received}\n"); py64(igps_drop_gsr_queries, "\t{:dropped-group-source-queries/%ju} " "{N:/group-source quer%s dropped}\n"); p64(igps_rcv_reports, "\t{:received-membership-requests/%ju} " "{N:/membership report%s received}\n"); p64(igps_rcv_badreports, "\t{:dropped-membership-reports/%ju} " "{N:/membership report%s received with invalid field(s)}\n"); p64(igps_rcv_ourreports, "\t" "{:received-membership-reports-matching/%ju} " "{N:/membership report%s received for groups to which we belong}" "\n"); p64(igps_rcv_nora, "\t{:received-v3-reports-no-router-alert/%ju} " "{N:/V3 report%s received without Router Alert}\n"); p64(igps_snd_reports, "\t{:sent-membership-reports/%ju} " "{N:/membership report%s sent}\n"); #undef p64 #undef py64 xo_close_container(name); } /* * Dump PIM statistics structure. */ void pim_stats(u_long off __unused, const char *name, int af1 __unused, int proto __unused) { struct pimstat pimstat; if (fetch_stats("net.inet.pim.stats", off, &pimstat, sizeof(pimstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (pimstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)pimstat.f, plural(pimstat.f)) #define py(f, m) if (pimstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)pimstat.f, pimstat.f != 1 ? "ies" : "y") p(pims_rcv_total_msgs, "\t{:received-messages/%ju} " "{N:/message%s received}\n"); p(pims_rcv_total_bytes, "\t{:received-bytes/%ju} " "{N:/byte%s received}\n"); p(pims_rcv_tooshort, "\t{:dropped-too-short/%ju} " "{N:/message%s received with too few bytes}\n"); p(pims_rcv_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/message%s received with bad checksum}\n"); p(pims_rcv_badversion, "\t{:dropped-bad-version/%ju} " "{N:/message%s received with bad version}\n"); p(pims_rcv_registers_msgs, "\t{:received-data-register-messages/%ju} " "{N:/data register message%s received}\n"); p(pims_rcv_registers_bytes, "\t{:received-data-register-bytes/%ju} " "{N:/data register byte%s received}\n"); p(pims_rcv_registers_wrongiif, "\t" "{:received-data-register-wrong-interface/%ju} " "{N:/data register message%s received on wrong iif}\n"); p(pims_rcv_badregisters, "\t{:received-bad-registers/%ju} " "{N:/bad register%s received}\n"); p(pims_snd_registers_msgs, "\t{:sent-data-register-messages/%ju} " "{N:/data register message%s sent}\n"); p(pims_snd_registers_bytes, "\t{:sent-data-register-bytes/%ju} " "{N:/data register byte%s sent}\n"); #undef p #undef py xo_close_container(name); } /* * Dump divert(4) statistics structure. */ void divert_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct divstat divstat; if (fetch_stats("net.inet.divert.stats", off, &divstat, sizeof(divstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (divstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)divstat.f, plural(divstat.f)) p(div_diverted, "\t{:diverted-packets/%ju} " "{N:/packet%s successfully diverted to userland}\n"); p(div_noport, "\t{:noport-fails/%ju} " "{N:/packet%s failed to divert due to no socket bound at port}\n"); p(div_outbound, "\t{:outbound-packets/%ju} " "{N:/packet%s successfully re-injected as outbound}\n"); p(div_inbound, "\t{:inbound-packets/%ju} " "{N:/packet%s successfully re-injected as inbound}\n"); #undef p xo_close_container(name); } #ifdef INET /* * Pretty print an Internet address (net address + port). */ static void inetprint(const char *container, struct in_addr *in, int port, const char *proto, int num_port, const int af1) { struct servent *sp = 0; char line[80], *cp; int width; size_t alen, plen; if (container) xo_open_container(container); if (Wflag) snprintf(line, sizeof(line), "%s.", inetname(in)); else snprintf(line, sizeof(line), "%.*s.", (Aflag && !num_port) ? 12 : 16, inetname(in)); alen = strlen(line); cp = line + alen; if (!num_port && port) sp = getservbyport((int)port, proto); if (sp || port == 0) snprintf(cp, sizeof(line) - alen, "%.15s ", sp ? sp->s_name : "*"); else snprintf(cp, sizeof(line) - alen, "%d ", ntohs((u_short)port)); width = (Aflag && !Wflag) ? 18 : ((!Wflag || af1 == AF_INET) ? 22 : 45); if (Wflag) xo_emit("{d:target/%-*s} ", width, line); else xo_emit("{d:target/%-*.*s} ", width, width, line); plen = strlen(cp) - 1; alen--; xo_emit("{e:address/%*.*s}{e:port/%*.*s}", alen, alen, line, plen, plen, cp); if (container) xo_close_container(container); } /* * Construct an Internet address representation. * If numeric_addr has been supplied, give * numeric value, otherwise try for symbolic name. */ char * inetname(struct in_addr *inp) { char *cp; static char line[MAXHOSTNAMELEN]; struct hostent *hp; cp = 0; if (!numeric_addr && inp->s_addr != INADDR_ANY) { hp = gethostbyaddr((char *)inp, sizeof (*inp), AF_INET); if (hp) { cp = hp->h_name; trimdomain(cp, strlen(cp)); } } if (inp->s_addr == INADDR_ANY) strcpy(line, "*"); else if (cp) { strlcpy(line, cp, sizeof(line)); } else { inp->s_addr = ntohl(inp->s_addr); #define C(x) ((u_int)((x) & 0xff)) snprintf(line, sizeof(line), "%u.%u.%u.%u", C(inp->s_addr >> 24), C(inp->s_addr >> 16), C(inp->s_addr >> 8), C(inp->s_addr)); } return (line); } #endif