diff --git a/cddl/lib/libdtrace/siftr.d b/cddl/lib/libdtrace/siftr.d index 791693db7638..ecb307822671 100644 --- a/cddl/lib/libdtrace/siftr.d +++ b/cddl/lib/libdtrace/siftr.d @@ -1,99 +1,110 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ */ +#pragma D depends_on library ip.d #pragma D depends_on module kernel #pragma D depends_on module siftr #pragma D depends_on provider tcp /* * Convert a SIFTR direction value to a string */ #pragma D binding "1.12.1" SIFTR_IN inline int SIFTR_IN = 0; #pragma D binding "1.12.1" SIFTR_OUT inline int SIFTR_OUT = 1; /* SIFTR direction strings. */ #pragma D binding "1.12.1" siftr_dir_string inline string siftr_dir_string[uint8_t direction] = direction == SIFTR_IN ? "in" : direction == SIFTR_OUT ? 
"out" : "unknown" ; typedef struct siftrinfo { struct timeval tval; uint8_t direction; uint8_t ipver; - uint16_t tcp_localport; - uint16_t tcp_foreignport; + uint16_t lport; + uint16_t rport; + string laddr; + string raddr; uint32_t snd_cwnd; uint32_t snd_wnd; uint32_t rcv_wnd; uint32_t t_flags2; uint32_t snd_ssthresh; int conn_state; - u_int max_seg_size; + uint32_t mss; uint32_t srtt; u_char sack_enabled; u_char snd_scale; u_char rcv_scale; - u_int flags; + u_int t_flags; uint32_t rto; u_int snd_buf_hiwater; u_int snd_buf_cc; u_int rcv_buf_hiwater; u_int rcv_buf_cc; u_int sent_inflight_bytes; int t_segqlen; u_int flowid; u_int flowtype; } siftrinfo_t; #pragma D binding "1.12.1" translator translator siftrinfo_t < struct pkt_node *p > { direction = p == NULL ? 0 : p->direction; ipver = p == NULL ? 0 : p->ipver; - tcp_localport = p == NULL ? 0 : ntohs(p->tcp_localport); - tcp_foreignport = p == NULL ? 0 : ntohs(p->tcp_foreignport); + lport = p == NULL ? 0 : ntohs(p->lport); + rport = p == NULL ? 0 : ntohs(p->fport); + laddr = p == NULL ? "" : + p->ipver == INP_IPV4 ? + inet_ntoa(&p->laddr.id46_addr.ia46_addr4.s_addr) : + inet_ntoa6(&p->laddr.id6_addr); + raddr = p == NULL ? "" : + p->ipver == INP_IPV4 ? + inet_ntoa(&p->faddr.id46_addr.ia46_addr4.s_addr) : + inet_ntoa6(&p->faddr.id6_addr); snd_cwnd = p == NULL ? 0 : p->snd_cwnd; snd_wnd = p == NULL ? 0 : p->snd_wnd; rcv_wnd = p == NULL ? 0 : p->rcv_wnd; t_flags2 = p == NULL ? 0 : p->t_flags2; snd_ssthresh = p == NULL ? 0 : p->snd_ssthresh; conn_state = p == NULL ? 0 : p->conn_state; - max_seg_size = p == NULL ? 0 : p->max_seg_size; + mss = p == NULL ? 0 : p->mss; srtt = p == NULL ? 0 : p->srtt; sack_enabled = p == NULL ? 0 : p->sack_enabled; snd_scale = p == NULL ? 0 : p->snd_scale; rcv_scale = p == NULL ? 0 : p->rcv_scale; - flags = p == NULL ? 0 : p->flags; + t_flags = p == NULL ? 0 : p->t_flags; rto = p == NULL ? 0 : p->rto; snd_buf_hiwater = p == NULL ? 0 : p->snd_buf_hiwater; snd_buf_cc = p == NULL ? 
0 : p->snd_buf_cc; rcv_buf_hiwater = p == NULL ? 0 : p->rcv_buf_hiwater; rcv_buf_cc = p == NULL ? 0 : p->rcv_buf_cc; sent_inflight_bytes = p == NULL ? 0 : p->sent_inflight_bytes; t_segqlen = p == NULL ? 0 : p->t_segqlen; flowid = p == NULL ? 0 : p->flowid; flowtype = p == NULL ? 0 : p->flowtype; }; diff --git a/share/man/man4/dtrace_tcp.4 b/share/man/man4/dtrace_tcp.4 index 49dd9449d887..0bb5b79169e4 100644 --- a/share/man/man4/dtrace_tcp.4 +++ b/share/man/man4/dtrace_tcp.4 @@ -1,524 +1,528 @@ .\" Copyright (c) 2015 Mark Johnston .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. 
.\" .\" $FreeBSD$ .\" -.Dd July 1, 2023 +.Dd July 2, 2023 .Dt DTRACE_TCP 4 .Os .Sh NAME .Nm dtrace_tcp .Nd a DTrace provider for tracing events related to the .Xr tcp 4 protocol .Sh SYNOPSIS .Fn tcp:::accept-established "pktinfo_t *" "csinfo_t *" "ipinfo_t *" \ "tcpsinfo_t *" "tcpinfo_t *" .Fn tcp:::accept-refused "pktinfo_t *" "csinfo_t *" "ipinfo_t *" \ "tcpsinfo_t *" "tcpinfo_t *" .Fn tcp:::connect-established "pktinfo_t *" "csinfo_t *" "ipinfo_t *" \ "tcpsinfo_t *" "tcpinfo_t *" .Fn tcp:::connect-refused "pktinfo_t *" "csinfo_t *" "ipinfo_t *" \ "tcpsinfo_t *" "tcpinfo_t *" .Fn tcp:::connect-request "pktinfo_t *" "csinfo_t *" "ipinfo_t *" \ "tcpsinfo_t *" "tcpinfo_t *" .Fn tcp:::receive "pktinfo_t *" "csinfo_t *" "ipinfo_t *" "tcpsinfo_t *" \ "tcpinfo_t *" .Fn tcp:::send "pktinfo_t *" "csinfo_t *" "ipinfo_t *" "tcpsinfo_t *" \ "tcpinfo_t *" .Fn tcp:::state-change "void *" "csinfo_t *" "void *" "tcpsinfo_t *" "void *" \ "tcplsinfo_t *" .Fn tcp:::siftr "siftrinfo_t *" .Sh DESCRIPTION The DTrace .Nm tcp provider allows users to trace events in the .Xr tcp 4 protocol implementation. This provider is similar to the .Xr dtrace_ip 4 and .Xr dtrace_udp 4 providers, but additionally contains probes corresponding to protocol events at a level higher than packet reception and transmission. All .Nm tcp probes except for .Fn tcp:::state-change and .Fn tcp:::siftr have the same number and type of arguments. The last three arguments are used to describe a TCP segment: the .Vt ipinfo_t argument exposes the version-agnostic fields of the IP header, while the .Vt tcpinfo_t argument exposes the TCP header, and the .Vt tcpsinfo_t argument describes details of the corresponding TCP connection state, if any. Their fields are described in the ARGUMENTS section. .Pp The .Fn tcp:::accept-established probe fires when a remotely-initiated active TCP open succeeds. 
At this point the new connection is in the ESTABLISHED state, and the probe arguments expose the headers associated with the final ACK of the three-way handshake. The .Fn tcp:::accept-refused probe fires when a SYN arrives on a port without a listening socket. The probe arguments expose the headers associated with the RST to be transmitted to the remote host in response to the SYN segment. .Pp The .Fn tcp:::connect-established , .Fn tcp:::connect-refused , and .Fn tcp:::connect-request probes are similar to the .Ql accept probes, except that they correspond to locally-initiated TCP connections. The .Fn tcp:::connect-established probe fires when the SYN-ACK segment of a three-way handshake is received from the remote host and a final ACK is prepared for transmission. This occurs immediately after the local connection state transitions from SYN-SENT to ESTABLISHED. The probe arguments describe the headers associated with the received SYN-ACK segment. The .Fn tcp:::connect-refused probe fires when the local host receives a RST segment in response to a SYN segment, indicating that the remote host refused to open a connection. The probe arguments describe the IP and TCP headers associated with the received RST segment. The .Fn tcp:::connect-request probe fires as the kernel prepares to transmit the initial SYN segment of a three-way handshake. .Pp The .Fn tcp:::send and .Fn tcp:::receive probes fire when the host sends or receives a TCP packet, respectively. As with the .Xr dtrace_udp 4 provider, .Nm tcp probes fire only for packets sent by or to the local host; forwarded packets are handled in the IP layer and are only visible to the .Xr dtrace_ip 4 provider. .Pp The .Fn tcp:::state-change probe fires upon local TCP connection state transitions. Its first, third and fifth arguments are currently always .Dv NULL . Its last argument describes the from-state in the transition, and the to-state can be obtained from .Dv args[3]->tcps_state . 
.Pp The .Fn tcp:::siftr probe fires when a TCP segment is sent or received by the host. For a detailed description see .Xr siftr 4 . The .Vt siftrinfo_t argument provides the information about the TCP connection. .Sh ARGUMENTS The .Vt pktinfo_t argument is currently unimplemented and is included for compatibility with other implementations of this provider. Its fields are: .Bl -tag -width "uintptr_t pkt_addr" -offset indent .It Vt uintptr_t pkt_addr Always set to 0. .El .Pp The .Vt csinfo_t argument is currently unimplemented and is included for compatibility with other implementations of this provider. Its fields are: .Bl -tag -width "uintptr_t cs_addr" -offset indent .It Vt uintptr_t cs_addr Always set to 0. .It Vt uint64_t cs_cid A pointer to the .Vt struct inpcb for this packet, or .Dv NULL . .It Vt pid_t cs_pid Always set to 0. .El .Pp The .Vt ipinfo_t type is a version-agnostic representation of fields from an IP header. Its fields are described in the .Xr dtrace_ip 4 manual page. .Pp The .Vt tcpsinfo_t type is used to provide a stable representation of TCP connection state. Some .Nm tcp probes, such as .Fn tcp:::accept-refused , fire in a context where there is no TCP connection; this argument is .Dv NULL in that case. Its fields are: .Bl -tag -width "uint16_t tcps_lport" -offset indent .It Vt uintptr_t tcps_addr The address of the corresponding TCP control block. This is currently a pointer to a .Vt struct tcpcb . .It Vt int tcps_local A boolean indicating whether the connection is local to the host. Currently unimplemented and always set to -1. .It Vt int tcps_active A boolean indicating whether the connection was initiated by the local host. Currently unimplemented and always set to -1. .It Vt uint16_t tcps_lport Local TCP port. .It Vt uint16_t tcps_rport Remote TCP port. .It Vt string tcps_laddr Local address. .It Vt string tcps_raddr Remote address. .It Vt int32_t tcps_state Current TCP state. 
The valid TCP state values are given by the constants prefixed with .Ql TCPS_ in .Pa /usr/lib/dtrace/tcp.d . .It Vt uint32_t tcps_iss Initial send sequence number. .It Vt uint32_t tcps_suna Initial sequence number of sent but unacknowledged data. .It Vt uint32_t tcps_snxt Next sequence number for send. .It Vt uint32_t tcps_rack Sequence number of received and acknowledged data. .It Vt uint32_t tcps_rnxt Next expected sequence number for receive. .It Vt u_long tcps_swnd TCP send window size. .It Vt int32_t tcps_snd_ws Window scaling factor for the TCP send window. .It Vt u_long tcps_rwnd TCP receive window size. .It Vt int32_t tcps_rcv_ws Window scaling factor for the TCP receive window. .It Vt u_long tcps_cwnd TCP congestion window size. .It Vt u_long tcps_cwnd_ssthresh Congestion window threshold at which slow start ends and congestion avoidance begins. .It Vt uint32_t tcps_sack_fack Last sequence number selectively acknowledged by the receiver. .It Vt uint32_t tcps_sack_snxt Next selectively acknowledged sequence number at which to begin retransmitting. .It Vt uint32_t tcps_rto Round-trip timeout, in milliseconds. .It Vt uint32_t tcps_mss Maximum segment size. .It Vt int tcps_retransmit A boolean indicating that the local sender is retransmitting data. .It Vt int tcps_srtt Smoothed round-trip time. .El .Pp The .Vt tcpinfo_t type exposes the fields in a TCP segment header in host order. Its fields are: .Bl -tag -width "struct tcphdr *tcp_hdr" -offset indent .It Vt uint16_t tcp_sport Source TCP port. .It Vt uint16_t tcp_dport Destination TCP port. .It Vt uint32_t tcp_seq Sequence number. .It Vt uint32_t tcp_ack Acknowledgement number. .It Vt uint8_t tcp_offset Data offset, in bytes. .It Vt uint8_t tcp_flags TCP flags. .It Vt uint16_t tcp_window TCP window size. .It Vt uint16_t tcp_checksum Checksum. .It Vt uint16_t tcp_urgent Urgent data pointer. .It Vt struct tcphdr *tcp_hdr A pointer to the raw TCP header. 
.El .Pp The .Vt tcplsinfo_t type is used by the .Fn tcp:::state-change probe to provide the from-state of a transition. Its fields are: .Bl -tag -width "int32_t tcps_state" -offset indent .It Vt int32_t tcps_state A TCP state. The valid TCP state values are given by the constants prefixed with .Ql TCPS_ in .Pa /usr/lib/dtrace/tcp.d . .El .Pp The .Vt siftrinfo_t type is used by the .Fn tcp:::siftr probe to provide the state of the TCP connection. Its fields are: .Bl -tag -width "u_int sent_inflight_bytes" -offset indent .It Vt uint8_t direction Direction of packet that triggered the log message. Either .Qq 0 for in, or .Qq 1 for out. .It Vt uint8_t ipver The version of the IP protocol being used. Either .Qq 1 for IPv4, or .Qq 2 for IPv6. -.It Vt uint16_t tcp_localport +.It Vt uint16_t lport The TCP port that the local host is communicating via. -.It Vt uint16_t tcp_foreignport -The TCP port that the foreign host is communicating via. +.It Vt uint16_t rport +The TCP port that the remote host is communicating via. +.It Vt string laddr +The IPv4 or IPv6 address of the local host. +.It Vt string raddr +The IPv4 or IPv6 address of the remote host. .It Vt uint32_t snd_cwnd The current congestion window (CWND) for the flow, in bytes. .It Vt uint32_t snd_wnd The current sending window for the flow, in bytes. The post scaled value is reported, except during the initial handshake (first few packets), during which time the unscaled value is reported. .It Vt uint32_t rcv_wnd The current receive window for the flow, in bytes. The post scaled value is always reported. .It Vt uint32_t t_flags2 The current value of the t_flags2 for the flow. .It Vt uint32_t snd_ssthresh The slow start threshold (SSTHRESH) for the flow, in bytes. .It Vt int conn_state A TCP state. The valid TCP state values are given by the constants prefixed with .Ql TCPS_ in .Pa /usr/lib/dtrace/tcp.d . -.It Vt u_int max_seg_size -The maximum segment size for the flow, in bytes. 
+.It Vt uint32_t mss +The maximum segment size (MSS) for the flow, in bytes. .It Vt uint32_t srtt The current smoothed RTT (SRTT) for the flow in microseconds. .It Vt u_char sack_enabled SACK enabled indicator. 1 if SACK enabled, 0 otherwise. .It Vt u_char snd_scale The current window scaling factor for the sending window. .It Vt u_char rcv_scale The current window scaling factor for the receiving window. -.It Vt u_int flags +.It Vt u_int t_flags The current value of the t_flags for the flow. .It Vt uint32_t rto The current retransmission timeout (RTO) for the flow in microseconds. Divide by HZ to get the timeout length in seconds. .It Vt u_int snd_buf_hiwater The current size of the socket send buffer in bytes. .It Vt u_int snd_buf_cc The current number of bytes in the socket send buffer. .It Vt u_int rcv_buf_hiwater The current size of the socket receive buffer in bytes. .It Vt u_int rcv_buf_cc The current number of bytes in the socket receive buffer. .It Vt u_int sent_inflight_bytes The current number of unacknowledged bytes in-flight. Bytes acknowledged via SACK are not excluded from this count. .It Vt int t_segqlen The current number of segments in the reassembly queue. .It Vt u_int flowid Flowid for the connection. A caveat: Zero '0' either represents a valid flowid or a default value when the flowid is not being set. .It Vt u_int flowtype Flow type for the connection. Flowtype defines which protocol fields are hashed to produce the flowid. A complete listing is available in .Pa /usr/include/sys/mbuf.h under .Dv M_HASHTYPE_* . .El .Sh FILES .Bl -tag -width "/usr/lib/dtrace/siftr.d" -compact .It Pa /usr/lib/dtrace/tcp.d DTrace type and translator definitions for all the probes of the .Nm tcp provider except the .Nm siftr probe. .It Pa /usr/lib/dtrace/siftr.d DTrace type and translator definitions for the .Nm siftr probe of the .Nm tcp provider. 
.El .Sh EXAMPLES The following script logs TCP segments in real time: .Bd -literal -offset indent #pragma D option quiet #pragma D option switchrate=10hz dtrace:::BEGIN { printf(" %3s %15s:%-5s %15s:%-5s %6s %s\\n", "CPU", "LADDR", "LPORT", "RADDR", "RPORT", "BYTES", "FLAGS"); } tcp:::send { this->length = args[2]->ip_plength - args[4]->tcp_offset; printf(" %3d %16s:%-5d -> %16s:%-5d %6d (", cpu, args[2]->ip_saddr, args[4]->tcp_sport, args[2]->ip_daddr, args[4]->tcp_dport, this->length); printf("%s", args[4]->tcp_flags & TH_FIN ? "FIN|" : ""); printf("%s", args[4]->tcp_flags & TH_SYN ? "SYN|" : ""); printf("%s", args[4]->tcp_flags & TH_RST ? "RST|" : ""); printf("%s", args[4]->tcp_flags & TH_PUSH ? "PUSH|" : ""); printf("%s", args[4]->tcp_flags & TH_ACK ? "ACK|" : ""); printf("%s", args[4]->tcp_flags & TH_URG ? "URG|" : ""); printf("%s", args[4]->tcp_flags == 0 ? "null " : ""); printf("\\b)\\n"); } tcp:::receive { this->length = args[2]->ip_plength - args[4]->tcp_offset; printf(" %3d %16s:%-5d <- %16s:%-5d %6d (", cpu, args[2]->ip_daddr, args[4]->tcp_dport, args[2]->ip_saddr, args[4]->tcp_sport, this->length); printf("%s", args[4]->tcp_flags & TH_FIN ? "FIN|" : ""); printf("%s", args[4]->tcp_flags & TH_SYN ? "SYN|" : ""); printf("%s", args[4]->tcp_flags & TH_RST ? "RST|" : ""); printf("%s", args[4]->tcp_flags & TH_PUSH ? "PUSH|" : ""); printf("%s", args[4]->tcp_flags & TH_ACK ? "ACK|" : ""); printf("%s", args[4]->tcp_flags & TH_URG ? "URG|" : ""); printf("%s", args[4]->tcp_flags == 0 ? 
"null " : ""); printf("\\b)\\n"); } .Ed The following script logs TCP connection state changes as they occur: .Bd -literal -offset indent #pragma D option quiet #pragma D option switchrate=25hz int last[int]; dtrace:::BEGIN { printf(" %12s %-20s %-20s %s\\n", "DELTA(us)", "OLD", "NEW", "TIMESTAMP"); } tcp:::state-change { this->elapsed = (timestamp - last[args[1]->cs_cid]) / 1000; printf(" %12d %-20s -> %-20s %d\\n", this->elapsed, tcp_state_string[args[5]->tcps_state], tcp_state_string[args[3]->tcps_state], timestamp); last[args[1]->cs_cid] = timestamp; } tcp:::state-change /last[args[1]->cs_cid] == 0/ { printf(" %12s %-20s -> %-20s %d\\n", "-", tcp_state_string[args[5]->tcps_state], tcp_state_string[args[3]->tcps_state], timestamp); last[args[1]->cs_cid] = timestamp; } .Ed The following script uses the siftr probe to show the current value of CWND and SSTHRESH when a packet is sent or received: .Bd -literal -offset indent #pragma D option quiet #pragma D option switchrate=10hz dtrace:::BEGIN { - printf(" %3s %5s %5s %10s %10s\\n", - "DIR", "LPORT", "RPORT", "CWND", "SSTHRESH"); + printf(" %3s %16s:%-5s %16s:%-5s %10s %10s\\n", + "DIR", "LADDR", "LPORT", "RADDR", "RPORT", "CWND", "SSTHRESH"); } tcp:::siftr { - printf(" %3s %5d %5d %10d %10d\\n", + printf(" %3s %16s:%-5d %16s:%-5d %10u %10u\\n", siftr_dir_string[args[0]->direction], - args[0]->tcp_localport, args[0]->tcp_foreignport, + args[0]->laddr, args[0]->lport, args[0]->raddr, args[0]->rport, args[0]->snd_cwnd, args[0]->snd_ssthresh); } .Ed .Sh COMPATIBILITY This provider is compatible with the .Nm tcp provider in Solaris. .Sh SEE ALSO .Xr dtrace 1 , .Xr dtrace_ip 4 , .Xr dtrace_sctp 4 , .Xr dtrace_udp 4 , .Xr dtrace_udplite 4 , .Xr siftr 4 , .Xr tcp 4 , .Xr SDT 9 .Sh HISTORY The .Nm tcp provider first appeared in .Fx 10.0. .Sh AUTHORS This manual page was written by .An Mark Johnston Aq Mt markj@FreeBSD.org . 
.Sh BUGS The .Vt tcps_local and .Vt tcps_active fields of .Vt tcpsinfo_t are not filled in by the translator. diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index abbb72a05b06..9154f89fba30 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -1,1366 +1,1369 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2007-2009 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /****************************************************** * Statistical Information For TCP Research (SIFTR) * * A FreeBSD kernel module that adds very basic intrumentation to the * TCP stack, allowing internal stats to be recorded to a log file * for experimental, debugging and performance analysis purposes. * * SIFTR was first released in 2007 by James Healy and Lawrence Stewart whilst * working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ * * Work on SIFTR v1.2.x was sponsored by the FreeBSD Foundation as part of * the "Enhancing the FreeBSD TCP Implementation" project 2008-2009. * More details are available at: * http://www.freebsdfoundation.org/ * http://caia.swin.edu.au/freebsd/etcp09/ * * Lawrence Stewart is the current maintainer, and all contact regarding * SIFTR should be directed to him via email: lastewart@swin.edu.au * * Initial release date: June 2007 * Most recent update: September 2010 ******************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SIFTR_IPV6 #include #include #include #include #endif /* SIFTR_IPV6 */ #include /* * Three digit version number refers to X.Y.Z where: * X is the major version number * Y is bumped to mark backwards incompatible changes * Z is bumped to mark backwards compatible changes */ #define V_MAJOR 1 #define V_BACKBREAK 3 #define V_BACKCOMPAT 0 #define MODVERSION __CONCAT(V_MAJOR, 
__CONCAT(V_BACKBREAK, V_BACKCOMPAT)) #define MODVERSION_STR __XSTRING(V_MAJOR) "." __XSTRING(V_BACKBREAK) "." \ __XSTRING(V_BACKCOMPAT) #define HOOK 0 #define UNHOOK 1 #define SIFTR_EXPECTED_MAX_TCP_FLOWS 65536 #define SYS_NAME "FreeBSD" #define PACKET_TAG_SIFTR 100 #define PACKET_COOKIE_SIFTR 21749576 #define SIFTR_LOG_FILE_MODE 0644 #define SIFTR_DISABLE 0 #define SIFTR_ENABLE 1 /* * Hard upper limit on the length of log messages. Bump this up if you add new * data fields such that the line length could exceed the below value. */ #define MAX_LOG_MSG_LEN 300 /* XXX: Make this a sysctl tunable. */ #define SIFTR_ALQ_BUFLEN (1000*MAX_LOG_MSG_LEN) #ifdef SIFTR_IPV6 #define SIFTR_IPMODE 6 #else #define SIFTR_IPMODE 4 #endif static MALLOC_DEFINE(M_SIFTR, "siftr", "dynamic memory used by SIFTR"); static MALLOC_DEFINE(M_SIFTR_PKTNODE, "siftr_pktnode", "SIFTR pkt_node struct"); static MALLOC_DEFINE(M_SIFTR_HASHNODE, "siftr_hashnode", "SIFTR flow_hash_node struct"); /* Used as links in the pkt manager queue. */ struct pkt_node { /* Timestamp of pkt as noted in the pfil hook. */ struct timeval tval; /* Direction pkt is travelling. */ enum { DIR_IN = 0, DIR_OUT = 1, } direction; /* IP version pkt_node relates to; either INP_IPV4 or INP_IPV6. */ uint8_t ipver; /* Local TCP port. */ - uint16_t tcp_localport; + uint16_t lport; /* Foreign TCP port. */ - uint16_t tcp_foreignport; + uint16_t fport; + /* Local address. */ + union in_dependaddr laddr; + /* Foreign address. */ + union in_dependaddr faddr; /* Congestion Window (bytes). */ uint32_t snd_cwnd; /* Sending Window (bytes). */ uint32_t snd_wnd; /* Receive Window (bytes). */ uint32_t rcv_wnd; /* More tcpcb flags storage */ uint32_t t_flags2; /* Slow Start Threshold (bytes). */ uint32_t snd_ssthresh; /* Current state of the TCP FSM. */ int conn_state; /* Max Segment Size (bytes). */ - u_int max_seg_size; + uint32_t mss; /* Smoothed RTT (usecs). */ uint32_t srtt; /* Is SACK enabled? 
*/ u_char sack_enabled; /* Window scaling for snd window. */ u_char snd_scale; /* Window scaling for recv window. */ u_char rcv_scale; /* TCP control block flags. */ - u_int flags; + u_int t_flags; /* Retransmission timeout (usec). */ uint32_t rto; /* Size of the TCP send buffer in bytes. */ u_int snd_buf_hiwater; /* Current num bytes in the send socket buffer. */ u_int snd_buf_cc; /* Size of the TCP receive buffer in bytes. */ u_int rcv_buf_hiwater; /* Current num bytes in the receive socket buffer. */ u_int rcv_buf_cc; /* Number of bytes inflight that we are waiting on ACKs for. */ u_int sent_inflight_bytes; /* Number of segments currently in the reassembly queue. */ int t_segqlen; /* Flowid for the connection. */ u_int flowid; /* Flow type for the connection. */ u_int flowtype; /* Link to next pkt_node in the list. */ STAILQ_ENTRY(pkt_node) nodes; }; struct flow_info { #ifdef SIFTR_IPV6 char laddr[INET6_ADDRSTRLEN]; /* local IP address */ char faddr[INET6_ADDRSTRLEN]; /* foreign IP address */ #else char laddr[INET_ADDRSTRLEN]; /* local IP address */ char faddr[INET_ADDRSTRLEN]; /* foreign IP address */ #endif uint16_t lport; /* local TCP port */ uint16_t fport; /* foreign TCP port */ - uint8_t ipver; /* IP version */ uint32_t key; /* flowid of the connection */ }; struct flow_hash_node { uint16_t counter; struct flow_info const_info; /* constant connection info */ LIST_ENTRY(flow_hash_node) nodes; }; struct siftr_stats { /* # TCP pkts seen by the SIFTR PFIL hooks, including any skipped. */ uint64_t n_in; uint64_t n_out; /* # pkts skipped due to failed malloc calls. */ uint32_t nskip_in_malloc; uint32_t nskip_out_malloc; /* # pkts skipped due to failed inpcb lookups. */ uint32_t nskip_in_inpcb; uint32_t nskip_out_inpcb; /* # pkts skipped due to failed tcpcb lookups. */ uint32_t nskip_in_tcpcb; uint32_t nskip_out_tcpcb; /* # pkts skipped due to stack reinjection. 
*/ uint32_t nskip_in_dejavu; uint32_t nskip_out_dejavu; }; DPCPU_DEFINE_STATIC(struct siftr_stats, ss); static volatile unsigned int siftr_exit_pkt_manager_thread = 0; static unsigned int siftr_enabled = 0; static unsigned int siftr_pkts_per_log = 1; static uint16_t siftr_port_filter = 0; /* static unsigned int siftr_binary_log = 0; */ static char siftr_logfile[PATH_MAX] = "/var/log/siftr.log"; static char siftr_logfile_shadow[PATH_MAX] = "/var/log/siftr.log"; static u_long siftr_hashmask; STAILQ_HEAD(pkthead, pkt_node) pkt_queue = STAILQ_HEAD_INITIALIZER(pkt_queue); LIST_HEAD(listhead, flow_hash_node) *counter_hash; static int wait_for_pkt; static struct alq *siftr_alq = NULL; static struct mtx siftr_pkt_queue_mtx; static struct mtx siftr_pkt_mgr_mtx; static struct thread *siftr_pkt_manager_thr = NULL; static char direction[2] = {'i','o'}; /* Required function prototypes. */ static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS); static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS); /* Declare the net.inet.siftr sysctl tree and populate it. 
*/ SYSCTL_DECL(_net_inet_siftr); SYSCTL_NODE(_net_inet, OID_AUTO, siftr, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "siftr related settings"); SYSCTL_PROC(_net_inet_siftr, OID_AUTO, enabled, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &siftr_enabled, 0, &siftr_sysctl_enabled_handler, "IU", "switch siftr module operations on/off"); SYSCTL_PROC(_net_inet_siftr, OID_AUTO, logfile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &siftr_logfile_shadow, sizeof(siftr_logfile_shadow), &siftr_sysctl_logfile_name_handler, "A", "file to save siftr log messages to"); SYSCTL_UINT(_net_inet_siftr, OID_AUTO, ppl, CTLFLAG_RW, &siftr_pkts_per_log, 1, "number of packets between generating a log message"); SYSCTL_U16(_net_inet_siftr, OID_AUTO, port_filter, CTLFLAG_RW, &siftr_port_filter, 0, "enable packet filter on a TCP port"); /* XXX: TODO SYSCTL_UINT(_net_inet_siftr, OID_AUTO, binary, CTLFLAG_RW, &siftr_binary_log, 0, "write log files in binary instead of ascii"); */ /* Begin functions. */ static inline struct flow_hash_node * siftr_find_flow(struct listhead *counter_list, uint32_t id) { struct flow_hash_node *hash_node; /* * If the list is not empty i.e. the hash index has * been used by another flow previously. */ if (LIST_FIRST(counter_list) != NULL) { /* * Loop through the hash nodes in the list. * There should normally only be 1 hash node in the list. */ LIST_FOREACH(hash_node, counter_list, nodes) { /* * Check if the key for the pkt we are currently * processing is the same as the key stored in the * hash node we are currently processing. * If they are the same, then we've found the * hash node that stores the counter for the flow * the pkt belongs to. 
*/ if (hash_node->const_info.key == id) { return hash_node; } } } return NULL; } static inline struct flow_hash_node * siftr_new_hash_node(struct flow_info info, int dir, struct siftr_stats *ss) { struct flow_hash_node *hash_node; struct listhead *counter_list; counter_list = counter_hash + (info.key & siftr_hashmask); /* Create a new hash node to store the flow's constant info. */ hash_node = malloc(sizeof(struct flow_hash_node), M_SIFTR_HASHNODE, M_NOWAIT|M_ZERO); if (hash_node != NULL) { /* Initialise our new hash node list entry. */ hash_node->counter = 0; hash_node->const_info = info; LIST_INSERT_HEAD(counter_list, hash_node, nodes); return hash_node; } else { /* malloc failed */ if (dir == DIR_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; return NULL; } } static void siftr_process_pkt(struct pkt_node * pkt_node) { struct flow_hash_node *hash_node; struct listhead *counter_list; struct ale *log_buf; if (pkt_node->flowid == 0) { panic("%s: flowid not available", __func__); } counter_list = counter_hash + (pkt_node->flowid & siftr_hashmask); hash_node = siftr_find_flow(counter_list, pkt_node->flowid); if (hash_node == NULL) { return; } else if (siftr_pkts_per_log > 1) { /* * Taking the remainder of the counter divided * by the current value of siftr_pkts_per_log * and storing that in counter provides a neat * way to modulate the frequency of log * messages being written to the log file. */ hash_node->counter = (hash_node->counter + 1) % siftr_pkts_per_log; /* * If we have not seen enough packets since the last time * we wrote a log message for this connection, return. */ if (hash_node->counter > 0) return; } log_buf = alq_getn(siftr_alq, MAX_LOG_MSG_LEN, ALQ_WAITOK); if (log_buf == NULL) return; /* Should only happen if the ALQ is shutting down. */ /* Construct a log message. 
*/ /* The fields are written in the order of the arguments below: direction character, timestamp, local address and port, foreign address and port, then the per-packet TCP state. */ log_buf->ae_bytesused = snprintf(log_buf->ae_data, MAX_LOG_MSG_LEN, "%c,%jd.%06ld,%s,%hu,%s,%hu,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u," "%u,%u,%u,%u,%u,%u,%u,%u\n", direction[pkt_node->direction], (intmax_t)pkt_node->tval.tv_sec, pkt_node->tval.tv_usec, hash_node->const_info.laddr, hash_node->const_info.lport, hash_node->const_info.faddr, hash_node->const_info.fport, pkt_node->snd_ssthresh, pkt_node->snd_cwnd, pkt_node->t_flags2, pkt_node->snd_wnd, pkt_node->rcv_wnd, pkt_node->snd_scale, pkt_node->rcv_scale, pkt_node->conn_state, /* NOTE(review): the '-' and '+' prefixed arguments here are unified-diff residue (old and new field names side by side), not arithmetic. */ - pkt_node->max_seg_size, + pkt_node->mss, pkt_node->srtt, pkt_node->sack_enabled, - pkt_node->flags, + pkt_node->t_flags, pkt_node->rto, pkt_node->snd_buf_hiwater, pkt_node->snd_buf_cc, pkt_node->rcv_buf_hiwater, pkt_node->rcv_buf_cc, pkt_node->sent_inflight_bytes, pkt_node->t_segqlen, pkt_node->flowid, pkt_node->flowtype); alq_post_flags(siftr_alq, log_buf, 0); } /* siftr_pkt_manager_thread: kernel thread body. Each iteration sleeps on wait_for_pkt for at most one tick, splices pkt_queue onto a private list, then logs and frees every node with both mutexes dropped. draining starts at 2 and is decremented once per iteration while siftr_exit_pkt_manager_thread is set, so the loop runs one further pass after the exit request is first observed. */ static void siftr_pkt_manager_thread(void *arg) { STAILQ_HEAD(pkthead, pkt_node) tmp_pkt_queue = STAILQ_HEAD_INITIALIZER(tmp_pkt_queue); struct pkt_node *pkt_node, *pkt_node_temp; uint8_t draining; draining = 2; mtx_lock(&siftr_pkt_mgr_mtx); /* draining == 0 when queue has been flushed and it's safe to exit. */ while (draining) { /* * Sleep until we are signalled to wake because thread has * been told to exit or until 1 tick has passed. */ mtx_sleep(&wait_for_pkt, &siftr_pkt_mgr_mtx, PWAIT, "pktwait", 1); /* Gain exclusive access to the pkt_node queue. */ mtx_lock(&siftr_pkt_queue_mtx); /* * Move pkt_queue to tmp_pkt_queue, which leaves * pkt_queue empty and ready to receive more pkt_nodes. */ STAILQ_CONCAT(&tmp_pkt_queue, &pkt_queue); /* * We've finished making changes to the list. Unlock it * so the pfil hooks can continue queuing pkt_nodes. */ mtx_unlock(&siftr_pkt_queue_mtx); /* * We can't hold a mutex whilst calling siftr_process_pkt * because ALQ might sleep waiting for buffer space. */ mtx_unlock(&siftr_pkt_mgr_mtx); /* Flush all pkt_nodes to the log file. 
*/ STAILQ_FOREACH_SAFE(pkt_node, &tmp_pkt_queue, nodes, pkt_node_temp) { siftr_process_pkt(pkt_node); /* pkt_node is always the current head of tmp_pkt_queue here, so removing the head unlinks it before it is freed. */ STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes); free(pkt_node, M_SIFTR_PKTNODE); } KASSERT(STAILQ_EMPTY(&tmp_pkt_queue), ("SIFTR tmp_pkt_queue not empty after flush")); /* Retake the manager mutex before testing the exit flag and before the next mtx_sleep(). */ mtx_lock(&siftr_pkt_mgr_mtx); /* * If siftr_exit_pkt_manager_thread gets set during the window * where we are draining the tmp_pkt_queue above, there might * still be pkts in pkt_queue that need to be drained. * Allow one further iteration to occur after * siftr_exit_pkt_manager_thread has been set to ensure * pkt_queue is completely empty before we kill the thread. * * siftr_exit_pkt_manager_thread is set only after the pfil * hooks have been removed, so only 1 extra iteration * is needed to drain the queue. */ if (siftr_exit_pkt_manager_thread) draining--; } mtx_unlock(&siftr_pkt_mgr_mtx); /* Calls wakeup on this thread's struct thread ptr. */ kthread_exit(); } /* * Check if a given mbuf has the SIFTR mbuf tag. If it does, log the fact that * it's a reinjected packet and return. If it doesn't, tag the mbuf and return. * Return value >0 means the caller should skip processing this mbuf. * A failed tag allocation is counted in the nskip_in_malloc or * nskip_out_malloc statistic and also makes the caller skip the mbuf. */ static inline int siftr_chkreinject(struct mbuf *m, int dir, struct siftr_stats *ss) { if (m_tag_locate(m, PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, NULL) != NULL) { if (dir == PFIL_IN) ss->nskip_in_dejavu++; else ss->nskip_out_dejavu++; return (1); } else { /* First sighting: attach a zero-length tag, allocated with M_NOWAIT, so a later pass over the same mbuf takes the branch above. */ struct m_tag *tag = m_tag_alloc(PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, 0, M_NOWAIT); if (tag == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; return (1); } m_tag_prepend(m, tag); } return (0); } /* * Look up an inpcb for a packet. Return the inpcb pointer if found, or NULL * otherwise. The lookup passes INPLOOKUP_RLOCKPCB. For dir == PFIL_IN the * packet's source address and port are passed first, otherwise the * destination address and port are. A miss is counted in the nskip_in_inpcb * or nskip_out_inpcb statistic. */ static inline struct inpcb * siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, uint16_t dport, int dir, struct siftr_stats *ss) { struct inpcb *inp; /* We need the tcbinfo lock. 
*/ INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_src, sport, &((struct ip6_hdr *)ip)->ip6_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL #endif ); else inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_dst, dport, ip->ip_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_dst, dport, &((struct ip6_hdr *)ip)->ip6_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL #endif ); /* If we can't find the inpcb, bail. */ if (inp == NULL) { if (dir == PFIL_IN) ss->nskip_in_inpcb++; else ss->nskip_out_inpcb++; } return (inp); } /* siftr_get_flowid: return the flow id for inp and store the matching hash type in *phashtype. A non-zero inp_flowid is returned unchanged together with inp_flowtype; otherwise the id is computed from the connection's local and foreign addresses and ports with fib6_calc_packet_hash() (IPv6, only when SIFTR_IPV6 is defined) or fib4_calc_packet_hash(). */ static inline uint32_t siftr_get_flowid(struct inpcb *inp, int ipver, uint32_t *phashtype) { if (inp->inp_flowid == 0) { #ifdef SIFTR_IPV6 if (ipver == INP_IPV6) { return fib6_calc_packet_hash(&inp->in6p_laddr, &inp->in6p_faddr, inp->inp_lport, inp->inp_fport, IPPROTO_TCP, phashtype); } else #endif { return fib4_calc_packet_hash(inp->inp_laddr, inp->inp_faddr, inp->inp_lport, inp->inp_fport, IPPROTO_TCP, phashtype); } } else { *phashtype = inp->inp_flowtype; return inp->inp_flowid; } } /* siftr_siftdata: copy a snapshot of the connection state from inp and tp into pn, release the inpcb read lock when inp_locally_locked is set, record the direction and a microtime() timestamp, and fire the siftr TCP probe with pn. */ static inline void siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp, int ipver, int dir, int inp_locally_locked) { pn->ipver = ipver; /* NOTE(review): statements prefixed with '-' or '+' below are unified-diff residue showing the old and new field names, not unary operators. */ - pn->tcp_localport = inp->inp_lport; - pn->tcp_foreignport = inp->inp_fport; + pn->lport = inp->inp_lport; + pn->fport = inp->inp_fport; + pn->laddr = inp->inp_inc.inc_ie.ie_dependladdr; + pn->faddr = inp->inp_inc.inc_ie.ie_dependfaddr; pn->snd_cwnd = tp->snd_cwnd; pn->snd_wnd = tp->snd_wnd; pn->rcv_wnd = tp->rcv_wnd; pn->t_flags2 = tp->t_flags2; pn->snd_ssthresh = tp->snd_ssthresh; pn->snd_scale = tp->snd_scale; pn->rcv_scale = tp->rcv_scale; pn->conn_state = tp->t_state; - pn->max_seg_size = 
tp->t_maxseg; + pn->mss = tp->t_maxseg; /* srtt and rto are both multiplied by tick; srtt is additionally shifted right by TCP_RTT_SHIFT, with the product widened to 64 bits first. */ pn->srtt = ((uint64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; pn->sack_enabled = (tp->t_flags & TF_SACK_PERMIT) != 0; - pn->flags = tp->t_flags; + pn->t_flags = tp->t_flags; pn->rto = tp->t_rxtcur * tick; pn->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat; pn->snd_buf_cc = sbused(&inp->inp_socket->so_snd); pn->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat; pn->rcv_buf_cc = sbused(&inp->inp_socket->so_rcv); /* Bytes between the oldest unacknowledged sequence number (snd_una) and snd_max. */ pn->sent_inflight_bytes = tp->snd_max - tp->snd_una; pn->t_segqlen = tp->t_segqlen; /* We've finished accessing the tcb so release the lock. */ if (inp_locally_locked) INP_RUNLOCK(inp); /* Translate the pfil direction into the DIR_IN or DIR_OUT value stored in the record. */ pn->direction = (dir == PFIL_IN ? DIR_IN : DIR_OUT); /* * Significantly more accurate than using getmicrotime(), but slower! * Gives true microsecond resolution at the expense of a hit to * maximum pps throughput processing when SIFTR is loaded and enabled. */ microtime(&pn->tval); TCP_PROBE1(siftr, pn); } /* * pfil hook that is called for each IPv4 packet making its way through the * stack in either direction. * The pfil subsystem holds a non-sleepable mutex somewhere when * calling our hook function, so we can't sleep at all. * It's very important to use the M_NOWAIT flag with all function calls * that support it so that they won't sleep, otherwise you get a panic. * Every exit path returns PFIL_PASS. */ static pfil_return_t siftr_chkpkt(struct mbuf **m, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { struct pkt_node *pn; struct ip *ip; struct tcphdr *th; struct tcpcb *tp; struct siftr_stats *ss; unsigned int ip_hl; int inp_locally_locked, dir; uint32_t hash_id, hash_type; struct listhead *counter_list; struct flow_hash_node *hash_node; inp_locally_locked = 0; dir = PFIL_DIR(flags); /* Statistics are kept per CPU; ss points at this CPU's block. */ ss = DPCPU_PTR(ss); /* * m_pullup is not required here because ip_{input|output} * already do the heavy lifting for us. */ ip = mtod(*m, struct ip *); /* Only continue processing if the packet is TCP. 
*/ if (ip->ip_p != IPPROTO_TCP) goto ret; /* * Create a tcphdr struct starting at the correct offset * in the IP packet. ip->ip_hl gives the ip header length * in 4-byte words, so multiply it to get the size in bytes. */ ip_hl = (ip->ip_hl << 2); th = (struct tcphdr *)((caddr_t)ip + ip_hl); /* * Only pkts selected by the tcp port filter * can be inserted into the pkt_queue. A filter value of 0 * disables filtering; otherwise either the source or the * destination port must match. */ if ((siftr_port_filter != 0) && (siftr_port_filter != ntohs(th->th_sport)) && (siftr_port_filter != ntohs(th->th_dport))) { goto ret; } /* * If a kernel subsystem reinjects packets into the stack, our pfil * hook will be called multiple times for the same packet. * Make sure we only process unique packets. */ if (siftr_chkreinject(*m, dir, ss)) goto ret; if (dir == PFIL_IN) ss->n_in++; else ss->n_out++; /* * If the pfil hooks don't provide a pointer to the * inpcb, we need to find it ourselves and lock it. */ if (!inp) { /* Find the corresponding inpcb for this pkt. */ inp = siftr_findinpcb(INP_IPV4, ip, *m, th->th_sport, th->th_dport, dir, ss); if (inp == NULL) goto ret; else inp_locally_locked = 1; } INP_LOCK_ASSERT(inp); /* Find the TCP control block that corresponds with this packet */ tp = intotcpcb(inp); /* * If we can't find the TCP control block (happens occasionally for a * packet sent during the shutdown phase of a TCP connection), or the * TCP control block has not initialized (happens during TCPS_SYN_SENT), * bail. */ if (tp == NULL || tp->t_state < TCPS_ESTABLISHED) { if (dir == PFIL_IN) ss->nskip_in_tcpcb++; else ss->nskip_out_tcpcb++; goto inp_unlock; } /* Derive the flow id, pick its bucket in counter_hash and look for an existing node for this flow. */ hash_id = siftr_get_flowid(inp, INP_IPV4, &hash_type); counter_list = counter_hash + (hash_id & siftr_hashmask); hash_node = siftr_find_flow(counter_list, hash_id); /* If this flow hasn't been seen before, we create a new entry. 
*/ if (hash_node == NULL) { struct flow_info info; /* The addresses are stored as text and the ports in host byte order, which is the form the log writer prints. */ inet_ntoa_r(inp->inp_laddr, info.laddr); inet_ntoa_r(inp->inp_faddr, info.faddr); info.lport = ntohs(inp->inp_lport); info.fport = ntohs(inp->inp_fport); info.key = hash_id; /* NOTE(review): the '-' prefixed statement below is unified-diff residue for a removed line. */ - info.ipver = INP_IPV4; hash_node = siftr_new_hash_node(info, dir, ss); } /* Still NULL means siftr_new_hash_node() could not allocate; give up on this packet. */ if (hash_node == NULL) { goto inp_unlock; } /* Allocate the per-packet record with M_NOWAIT; a failure is counted per direction and the packet is skipped. */ pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO); if (pn == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; goto inp_unlock; } pn->flowid = hash_id; pn->flowtype = hash_type; /* siftr_siftdata() releases the inpcb lock itself when inp_locally_locked is set, which is why this path jumps to ret rather than inp_unlock. */ siftr_siftdata(pn, inp, tp, INP_IPV4, dir, inp_locally_locked); /* Queue the record for the pkt manager thread. */ mtx_lock(&siftr_pkt_queue_mtx); STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes); mtx_unlock(&siftr_pkt_queue_mtx); goto ret; inp_unlock: if (inp_locally_locked) INP_RUNLOCK(inp); ret: return (PFIL_PASS); } /* siftr_chkpkt6: IPv6 counterpart of siftr_chkpkt(), compiled only when SIFTR_IPV6 is defined. It runs the same filter, lookup, flow-hash and enqueue steps and always returns PFIL_PASS. */ #ifdef SIFTR_IPV6 static pfil_return_t siftr_chkpkt6(struct mbuf **m, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { struct pkt_node *pn; struct ip6_hdr *ip6; struct tcphdr *th; struct tcpcb *tp; struct siftr_stats *ss; unsigned int ip6_hl; int inp_locally_locked, dir; uint32_t hash_id, hash_type; struct listhead *counter_list; struct flow_hash_node *hash_node; inp_locally_locked = 0; dir = PFIL_DIR(flags); ss = DPCPU_PTR(ss); /* * m_pullup is not required here because ip6_{input|output} * already do the heavy lifting for us. */ ip6 = mtod(*m, struct ip6_hdr *); /* * Only continue processing if the packet is TCP * XXX: We should follow the next header fields * as shown on Pg 6 RFC 2460, but right now we'll * only check pkts that have no extension headers. */ if (ip6->ip6_nxt != IPPROTO_TCP) goto ret6; /* * Create a tcphdr struct starting at the correct offset * in the ipv6 packet. 
*/ /* No extension headers are handled here, so the TCP header is taken to start right after the fixed-size IPv6 header. */ ip6_hl = sizeof(struct ip6_hdr); th = (struct tcphdr *)((caddr_t)ip6 + ip6_hl); /* * Only pkts selected by the tcp port filter * can be inserted into the pkt_queue */ if ((siftr_port_filter != 0) && (siftr_port_filter != ntohs(th->th_sport)) && (siftr_port_filter != ntohs(th->th_dport))) { goto ret6; } /* * If a kernel subsystem reinjects packets into the stack, our pfil * hook will be called multiple times for the same packet. * Make sure we only process unique packets. */ if (siftr_chkreinject(*m, dir, ss)) goto ret6; if (dir == PFIL_IN) ss->n_in++; else ss->n_out++; /* * For inbound packets, the pfil hooks don't provide a pointer to the * inpcb, so we need to find it ourselves and lock it. */ if (!inp) { /* Find the corresponding inpcb for this pkt. */ inp = siftr_findinpcb(INP_IPV6, (struct ip *)ip6, *m, th->th_sport, th->th_dport, dir, ss); if (inp == NULL) goto ret6; else inp_locally_locked = 1; } /* NOTE(review): siftr_chkpkt() asserts INP_LOCK_ASSERT(inp) at this point; this function does not. Confirm whether the omission is intentional. */ /* Find the TCP control block that corresponds with this packet. */ tp = intotcpcb(inp); /* * If we can't find the TCP control block (happens occasionally for a * packet sent during the shutdown phase of a TCP connection), or the * TCP control block has not initialized (happens during TCPS_SYN_SENT), * bail. */ if (tp == NULL || tp->t_state < TCPS_ESTABLISHED) { if (dir == PFIL_IN) ss->nskip_in_tcpcb++; else ss->nskip_out_tcpcb++; goto inp_unlock6; } /* Derive the flow id, pick its bucket in counter_hash and look for an existing node for this flow. */ hash_id = siftr_get_flowid(inp, INP_IPV6, &hash_type); counter_list = counter_hash + (hash_id & siftr_hashmask); hash_node = siftr_find_flow(counter_list, hash_id); /* If this flow hasn't been seen before, we create a new entry. 
*/ if (!hash_node) { struct flow_info info; /* The addresses are stored as text and the ports in host byte order. */ ip6_sprintf(info.laddr, &inp->in6p_laddr); ip6_sprintf(info.faddr, &inp->in6p_faddr); info.lport = ntohs(inp->inp_lport); info.fport = ntohs(inp->inp_fport); info.key = hash_id; /* NOTE(review): the '-' prefixed statement below is unified-diff residue for a removed line. */ - info.ipver = INP_IPV6; hash_node = siftr_new_hash_node(info, dir, ss); } /* Still NULL means siftr_new_hash_node() could not allocate; give up on this packet. */ if (!hash_node) { goto inp_unlock6; } pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO); if (pn == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; goto inp_unlock6; } pn->flowid = hash_id; pn->flowtype = hash_type; /* siftr_siftdata() releases the inpcb lock itself when inp_locally_locked is set, which is why this path jumps to ret6 rather than inp_unlock6. */ siftr_siftdata(pn, inp, tp, INP_IPV6, dir, inp_locally_locked); /* Queue the record for the pkt manager thread. */ mtx_lock(&siftr_pkt_queue_mtx); STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes); mtx_unlock(&siftr_pkt_queue_mtx); goto ret6; inp_unlock6: if (inp_locally_locked) INP_RUNLOCK(inp); ret6: return (PFIL_PASS); } #endif /* #ifdef SIFTR_IPV6 */ /* Per-vnet handles for the registered pfil hooks, kept so siftr_pfil() can remove them again. */ VNET_DEFINE_STATIC(pfil_hook_t, siftr_inet_hook); #define V_siftr_inet_hook VNET(siftr_inet_hook) #ifdef SIFTR_IPV6 VNET_DEFINE_STATIC(pfil_hook_t, siftr_inet6_hook); #define V_siftr_inet6_hook VNET(siftr_inet6_hook) #endif /* siftr_pfil: for every vnet, either register and link the SIFTR hooks for both directions (action == HOOK) or remove them (action == UNHOOK). The IPv6 hook is handled only when SIFTR_IPV6 is defined. The results of pfil_link() are discarded and the function always returns 0. */ static int siftr_pfil(int action) { struct pfil_hook_args pha = { .pa_version = PFIL_VERSION, .pa_flags = PFIL_IN | PFIL_OUT, .pa_modname = "siftr", .pa_rulname = "default", }; struct pfil_link_args pla = { .pa_version = PFIL_VERSION, .pa_flags = PFIL_IN | PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR, }; VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (action == HOOK) { /* pha and pla are reused: first filled in for IPv4, then overwritten for IPv6. */ pha.pa_mbuf_chk = siftr_chkpkt; pha.pa_type = PFIL_TYPE_IP4; V_siftr_inet_hook = pfil_add_hook(&pha); pla.pa_hook = V_siftr_inet_hook; pla.pa_head = V_inet_pfil_head; (void)pfil_link(&pla); #ifdef SIFTR_IPV6 pha.pa_mbuf_chk = siftr_chkpkt6; pha.pa_type = PFIL_TYPE_IP6; V_siftr_inet6_hook = pfil_add_hook(&pha); pla.pa_hook = V_siftr_inet6_hook; pla.pa_head = V_inet6_pfil_head; (void)pfil_link(&pla); #endif } else if (action == UNHOOK) { pfil_remove_hook(V_siftr_inet_hook); #ifdef 
SIFTR_IPV6 pfil_remove_hook(V_siftr_inet6_hook); #endif } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS) { struct alq *new_alq; int error; error = sysctl_handle_string(oidp, arg1, arg2, req); /* Check for error or same filename */ if (error != 0 || req->newptr == NULL || strncmp(siftr_logfile, arg1, arg2) == 0) goto done; /* file name changed */ error = alq_open(&new_alq, arg1, curthread->td_ucred, SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); if (error != 0) goto done; /* * If disabled, siftr_alq == NULL so we simply close * the alq as we've proved it can be opened. * If enabled, close the existing alq and switch the old * for the new. */ if (siftr_alq == NULL) { alq_close(new_alq); } else { alq_close(siftr_alq); siftr_alq = new_alq; } /* Update filename upon success */ strlcpy(siftr_logfile, arg1, arg2); done: return (error); } static int siftr_manage_ops(uint8_t action) { struct siftr_stats totalss; struct timeval tval; struct flow_hash_node *counter, *tmp_counter; struct sbuf *s; int i, error; uint32_t bytes_to_write, total_skipped_pkts; error = 0; total_skipped_pkts = 0; /* Init an autosizing sbuf that initially holds 200 chars. */ if ((s = sbuf_new(NULL, NULL, 200, SBUF_AUTOEXTEND)) == NULL) return (-1); if (action == SIFTR_ENABLE && siftr_pkt_manager_thr == NULL) { /* * Create our alq * XXX: We should abort if alq_open fails! 
*/ alq_open(&siftr_alq, siftr_logfile, curthread->td_ucred, SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); STAILQ_INIT(&pkt_queue); DPCPU_ZERO(ss); siftr_exit_pkt_manager_thread = 0; kthread_add(&siftr_pkt_manager_thread, NULL, NULL, &siftr_pkt_manager_thr, RFNOWAIT, 0, "siftr_pkt_manager_thr"); siftr_pfil(HOOK); microtime(&tval); sbuf_printf(s, "enable_time_secs=%jd\tenable_time_usecs=%06ld\t" "siftrver=%s\tsysname=%s\tsysver=%u\tipmode=%u\n", (intmax_t)tval.tv_sec, tval.tv_usec, MODVERSION_STR, SYS_NAME, __FreeBSD_version, SIFTR_IPMODE); sbuf_finish(s); alq_writen(siftr_alq, sbuf_data(s), sbuf_len(s), ALQ_WAITOK); } else if (action == SIFTR_DISABLE && siftr_pkt_manager_thr != NULL) { /* * Remove the pfil hook functions. All threads currently in * the hook functions are allowed to exit before siftr_pfil() * returns. */ siftr_pfil(UNHOOK); /* This will block until the pkt manager thread unlocks it. */ mtx_lock(&siftr_pkt_mgr_mtx); /* Tell the pkt manager thread that it should exit now. */ siftr_exit_pkt_manager_thread = 1; /* * Wake the pkt_manager thread so it realises that * siftr_exit_pkt_manager_thread == 1 and exits gracefully. * The wakeup won't be delivered until we unlock * siftr_pkt_mgr_mtx so this isn't racy. */ wakeup(&wait_for_pkt); /* Wait for the pkt_manager thread to exit. 
*/ mtx_sleep(siftr_pkt_manager_thr, &siftr_pkt_mgr_mtx, PWAIT, "thrwait", 0); siftr_pkt_manager_thr = NULL; mtx_unlock(&siftr_pkt_mgr_mtx); totalss.n_in = DPCPU_VARSUM(ss, n_in); totalss.n_out = DPCPU_VARSUM(ss, n_out); totalss.nskip_in_malloc = DPCPU_VARSUM(ss, nskip_in_malloc); totalss.nskip_out_malloc = DPCPU_VARSUM(ss, nskip_out_malloc); totalss.nskip_in_tcpcb = DPCPU_VARSUM(ss, nskip_in_tcpcb); totalss.nskip_out_tcpcb = DPCPU_VARSUM(ss, nskip_out_tcpcb); totalss.nskip_in_inpcb = DPCPU_VARSUM(ss, nskip_in_inpcb); totalss.nskip_out_inpcb = DPCPU_VARSUM(ss, nskip_out_inpcb); total_skipped_pkts = totalss.nskip_in_malloc + totalss.nskip_out_malloc + totalss.nskip_in_tcpcb + totalss.nskip_out_tcpcb + totalss.nskip_in_inpcb + totalss.nskip_out_inpcb; microtime(&tval); sbuf_printf(s, "disable_time_secs=%jd\tdisable_time_usecs=%06ld\t" "num_inbound_tcp_pkts=%ju\tnum_outbound_tcp_pkts=%ju\t" "total_tcp_pkts=%ju\tnum_inbound_skipped_pkts_malloc=%u\t" "num_outbound_skipped_pkts_malloc=%u\t" "num_inbound_skipped_pkts_tcpcb=%u\t" "num_outbound_skipped_pkts_tcpcb=%u\t" "num_inbound_skipped_pkts_inpcb=%u\t" "num_outbound_skipped_pkts_inpcb=%u\t" "total_skipped_tcp_pkts=%u\tflow_list=", (intmax_t)tval.tv_sec, tval.tv_usec, (uintmax_t)totalss.n_in, (uintmax_t)totalss.n_out, (uintmax_t)(totalss.n_in + totalss.n_out), totalss.nskip_in_malloc, totalss.nskip_out_malloc, totalss.nskip_in_tcpcb, totalss.nskip_out_tcpcb, totalss.nskip_in_inpcb, totalss.nskip_out_inpcb, total_skipped_pkts); /* * Iterate over the flow hash, printing a summary of each * flow seen and freeing any malloc'd memory. * The hash consists of an array of LISTs (man 3 queue). 
*/ for (i = 0; i <= siftr_hashmask; i++) { LIST_FOREACH_SAFE(counter, counter_hash + i, nodes, tmp_counter) { sbuf_printf(s, "%s;%hu-%s;%hu,", counter->const_info.laddr, counter->const_info.lport, counter->const_info.faddr, counter->const_info.fport); free(counter, M_SIFTR_HASHNODE); } LIST_INIT(counter_hash + i); } sbuf_printf(s, "\n"); sbuf_finish(s); i = 0; do { bytes_to_write = min(SIFTR_ALQ_BUFLEN, sbuf_len(s)-i); alq_writen(siftr_alq, sbuf_data(s)+i, bytes_to_write, ALQ_WAITOK); i += bytes_to_write; } while (i < sbuf_len(s)); alq_close(siftr_alq); siftr_alq = NULL; } else error = EINVAL; sbuf_delete(s); /* * XXX: Should be using ret to check if any functions fail * and set error appropriately */ return (error); } static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = siftr_enabled; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new > 1) return (EINVAL); else if (new != siftr_enabled) { if ((error = siftr_manage_ops(new)) == 0) { siftr_enabled = new; } else { siftr_manage_ops(SIFTR_DISABLE); } } } return (error); } static void siftr_shutdown_handler(void *arg) { if (siftr_enabled == 1) { siftr_manage_ops(SIFTR_DISABLE); } } /* * Module is being unloaded or machine is shutting down. Take care of cleanup. */ static int deinit_siftr(void) { /* Cleanup. */ siftr_manage_ops(SIFTR_DISABLE); hashdestroy(counter_hash, M_SIFTR, siftr_hashmask); mtx_destroy(&siftr_pkt_queue_mtx); mtx_destroy(&siftr_pkt_mgr_mtx); return (0); } /* * Module has just been loaded into the kernel. */ static int init_siftr(void) { EVENTHANDLER_REGISTER(shutdown_pre_sync, siftr_shutdown_handler, NULL, SHUTDOWN_PRI_FIRST); /* Initialise our flow counter hash table. 
*/ counter_hash = hashinit(SIFTR_EXPECTED_MAX_TCP_FLOWS, M_SIFTR, &siftr_hashmask); /* Two mutexes: one for pkt_queue itself, one for the pkt manager thread's sleep and exit handshake. */ mtx_init(&siftr_pkt_queue_mtx, "siftr_pkt_queue_mtx", NULL, MTX_DEF); mtx_init(&siftr_pkt_mgr_mtx, "siftr_pkt_mgr_mtx", NULL, MTX_DEF); /* Print message to the user's current terminal. */ uprintf("\nStatistical Information For TCP Research (SIFTR) %s\n" " http://caia.swin.edu.au/urp/newtcp\n\n", MODVERSION_STR); return (0); } /* * This is the function that is called to load and unload the module. * When the module is loaded, this function is called once with * "what" == MOD_LOAD * When the module is unloaded, this function is called twice with * "what" = MOD_QUIESCE first, followed by "what" = MOD_UNLOAD second * When the system is shut down e.g. CTRL-ALT-DEL or using the shutdown command, * this function is called once with "what" = MOD_SHUTDOWN * When the system is shut down, the handler isn't called until the very end * of the shutdown sequence i.e. after the disks have been synced. * Returns the result of init_siftr() or deinit_siftr(), 0 for MOD_UNLOAD, * and EINVAL for any other event. */ static int siftr_load_handler(module_t mod, int what, void *arg) { int ret; switch (what) { case MOD_LOAD: ret = init_siftr(); break; /* All teardown happens at MOD_QUIESCE or MOD_SHUTDOWN, so MOD_UNLOAD below has nothing left to do. */ case MOD_QUIESCE: case MOD_SHUTDOWN: ret = deinit_siftr(); break; case MOD_UNLOAD: ret = 0; break; default: ret = EINVAL; break; } return (ret); } /* Module descriptor: names the module "siftr" and routes module events to siftr_load_handler(). */ static moduledata_t siftr_mod = { .name = "siftr", .evhand = siftr_load_handler, }; /* * Param 1: name of the kernel module * Param 2: moduledata_t struct containing info about the kernel module * and the execution entry point for the module * Param 3: From sysinit_sub_id enumeration in /usr/include/sys/kernel.h * Defines the module initialisation order * Param 4: From sysinit_elem_order enumeration in /usr/include/sys/kernel.h * Defines the initialisation order of this kld relative to others * within the same subsystem as defined by param 3 */ DECLARE_MODULE(siftr, siftr_mod, SI_SUB_LAST, SI_ORDER_ANY); /* Requires the alq module, version 1 exactly (minimum, preferred and maximum are all 1). */ MODULE_DEPEND(siftr, alq, 1, 1, 1); MODULE_VERSION(siftr, MODVERSION);