Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F159971506
D57309.id179809.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
118 KB
Referenced Files
None
Subscribers
None
D57309.id179809.diff
View Options
diff --git a/include/eventlog/tcp_eventlog_schema.src b/include/eventlog/tcp_eventlog_schema.src
new file mode 100644
--- /dev/null
+++ b/include/eventlog/tcp_eventlog_schema.src
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * This file defines the schema for TCP event logging.
+ * It is processed by eventlog_gen.awk to generate the event log header
+ *
+ * Format:
+ * PROVIDER <name>
+ * Defines the provider name used as a prefix for all generated definitions
+ *
+ * ENUM <name> <value1>:<name1> <value2>:<name2> ...
+ * Defines an enumeration for converting numeric values to strings.
+ * Used for single-value fields (e.g., ENUM timer_reason 0:PACING 1:DELACK)
+ *
+ * FLAG <name> <value1>:<name1> <value2>:<name2> ...
+ * Defines a flag set for converting bitmask values to strings.
+ * Used for fields that can have multiple flags combined (e.g., FLAG tcp_flags 1:SYN 2:ACK)
+ *
+ * STRUCT <name> <field1>:<type1>[:enum_or_flag] <field2>:<type2>[:enum_or_flag] ...
+ * Defines a structure for event data. Each field is specified as name:type[:enum_or_flag]
+ * (e.g., STRUCT IN seq:uint32_t ack:uint32_t)
+ * For enum/flag fields, use enum_<enum_name> or flag_<flag_name> (e.g., reason:uint8_t:enum_timer_reason)
+ *
+ * KEYWORD <name> <value>
+ * Defines a keyword flag with a numeric value (e.g., KEYWORD DATA 1)
+ *
+ * EVENT <name> <id> <level> <keywords> <struct> <print_format>
+ * Defines an event. Extra spaces are allowed for alignment.
+ * Format strings should use %s for enum/flag fields (e.g., "reason=%s")
+ *
+ * Where:
+ * name: Event name (e.g., IN)
+ * id: Numeric event ID
+ * level: Log level (ERROR, WARN, INFO, VERBOSE)
+ * keywords: Comma-separated list of keyword flags (e.g., DATA)
+ * struct: Name of the STRUCT definition to use for this event, or NONE for
+ * events with no payload. NONE events generate macros that take only
+ * the session parameter and emit zero-length payloads.
+ * print_format: printf-style format string for text output (use %s for enum/flag fields)
+ *
+ * Schema evolution
+ *
+ * Event payloads are decoded by struct layout: the consumer maps an event ID
+ * to a STRUCT and reads sizeof(STRUCT) bytes (or, for size-aware consumers,
+ * the payload length carried in the event header). Two kinds of changes are
+ * supported:
+ *
+ * 1. Appending fields to an existing STRUCT is allowed. Size-aware
+ * consumers will pick up the new tail; older consumers that decode by
+ * exact-size match must be updated (or made size-aware) before they
+ * can read captures emitted by the new kernel. Existing on-disk
+ * captures with the smaller payload continue to decode against the
+ * older field list — readers should treat any missing tail fields as
+ * zero.
+ *
+ * 2. Reordering, removing, or retyping fields in an existing STRUCT is
+ * NOT allowed — it silently breaks decoding of older captures.
+ * Instead, define a new STRUCT and a new EVENT with a `_V2` (or
+ * `_V3`, ...) suffix and a new event ID, put the new fields there,
+ * and mark the previous EVENT as deprecated in a comment while
+ * keeping its definition so older captures continue to decode.
+ *
+ * Consumers learn new event IDs at their own pace; unrecognized event IDs
+ * render as [UNKNOWN_EVENT_ID:N].
+ */
+
+PROVIDER tcp
+
+/* Packet direction */
+KEYWORD RX 1
+KEYWORD TX 2
+/* Loss detection and recovery (RTO, retransmit, DSACK, enter/exit recovery) */
+KEYWORD LOSS 3
+/* Congestion control algorithm state (cwnd, PRR, hystart, goodput, RTT) */
+KEYWORD CC 4
+/* Pacing decisions and hardware offload */
+KEYWORD PACING 5
+/* Protocol timer machinery */
+KEYWORD TIMER 6
+/* Application-facing events (sends, socket options, socket buffer wakeups) */
+KEYWORD APP 7
+/* Connection lifecycle and identity (address binding, log ID) */
+KEYWORD CONN 8
+/* High-frequency RTT samples (every SRTT update) */
+KEYWORD RTT 9
+
+/*
+ * Enumerations
+ */
+
+ENUM timer_reason
+ 0:PACING
+ 1:DELAYACK
+ 2:RACK
+ 3:TLP
+ 4:RXT
+ 5:PERSIST
+ 6:KEEPALIVE
+ 7:TWOMSL
+
+ENUM tcp_state
+ 0:CLOSED
+ 1:LISTEN
+ 2:SYN_SENT
+ 3:SYN_RECEIVED
+ 4:ESTABLISHED
+ 5:CLOSE_WAIT
+ 6:FIN_WAIT_1
+ 7:CLOSING
+ 8:LAST_ACK
+ 9:FIN_WAIT_2
+ 10:TIME_WAIT
+
+ENUM socket_option
+ 1:TCP_NODELAY
+ 2:TCP_MAXSEG
+ 4:TCP_NOPUSH
+ 8:TCP_NOOPT
+ 16:TCP_MD5SIG
+ 32:TCP_INFO
+ 33:TCP_STATS
+ 34:TCP_LOG
+ 35:TCP_LOGBUF
+ 36:TCP_LOGID
+ 37:TCP_LOGDUMP
+ 38:TCP_LOGDUMPID
+ 39:TCP_TXTLS_ENABLE
+ 40:TCP_TXTLS_MODE
+ 41:TCP_RXTLS_ENABLE
+ 42:TCP_RXTLS_MODE
+ 43:TCP_IWND_NB
+ 44:TCP_IWND_NSEG
+ 45:TCP_USE_DDP
+ 46:TCP_LOGID_CNT
+ 47:TCP_LOG_TAG
+ 48:TCP_USER_LOG
+ 64:TCP_CONGESTION
+ 65:TCP_CCALGOOPT
+ 68:TCP_MAXUNACKTIME
+ 70:TCP_IDLE_REDUCE
+ 71:TCP_REMOTE_UDP_ENCAPS_PORT
+ 72:TCP_DELACK
+ 73:TCP_FIN_IS_RST
+ 74:TCP_LOG_LIMIT
+ 75:TCP_SHARED_CWND_ALLOWED
+ 76:TCP_PROC_ACCOUNTING
+ 77:TCP_USE_CMP_ACKS
+ 78:TCP_PERF_INFO
+ 79:TCP_RST_REASON_CODE
+ 80:TCP_RST_REASON_ENABLE
+ 81:TCP_EVENTLOG
+ 128:TCP_KEEPINIT
+ 256:TCP_KEEPIDLE
+ 512:TCP_KEEPINTVL
+ 1024:TCP_KEEPCNT
+ 1025:TCP_FASTOPEN
+ 1026:TCP_REUSPORT_LB_NUMA
+ 1050:TCP_RACK_MBUF_QUEUE
+ 1052:TCP_RACK_TLP_REDUCE
+ 1054:TCP_RACK_PACE_MAX_SEG
+ 1055:TCP_RACK_PACE_ALWAYS
+ 1057:TCP_RACK_PRR_SENDALOT
+ 1058:TCP_RACK_MIN_TO
+ 1060:TCP_RACK_EARLY_SEG
+ 1061:TCP_RACK_REORD_THRESH
+ 1062:TCP_RACK_REORD_FADE
+ 1063:TCP_RACK_TLP_THRESH
+ 1064:TCP_RACK_PKT_DELAY
+ 1067:TCP_BBR_IWINTSO
+ 1069:TCP_BBR_STARTUP_PG
+ 1070:TCP_BBR_DRAIN_PG
+ 1072:TCP_BBR_PROBE_RTT_INT
+ 1074:TCP_BBR_STARTUP_LOSS_EXIT
+ 1076:TCP_BBR_TSLIMITS
+ 1077:TCP_BBR_PACE_OH
+ 1079:TCP_BBR_USEDEL_RATE
+ 1080:TCP_BBR_MIN_RTO
+ 1081:TCP_BBR_MAX_RTO
+ 1083:TCP_BBR_ALGORITHM
+ 1086:TCP_BBR_PACE_PER_SEC
+ 1087:TCP_BBR_PACE_DEL_TAR
+ 1088:TCP_BBR_PACE_SEG_MAX
+ 1089:TCP_BBR_PACE_SEG_MIN
+ 1090:TCP_BBR_PACE_CROSS
+ 1095:TCP_RACK_TLP_USE
+ 1096:TCP_BBR_TMR_PACE_OH
+ 1097:TCP_RACK_DO_DETECTION
+ 1098:TCP_BBR_RACK_RTT_USE
+ 1099:TCP_BBR_RETRAN_WTSO
+ 1100:TCP_DATA_AFTER_CLOSE
+ 1101:TCP_BBR_PROBE_RTT_GAIN
+ 1102:TCP_BBR_PROBE_RTT_LEN
+ 1103:TCP_BBR_SEND_IWND_IN_TSO
+ 1104:TCP_BBR_USE_RACK_RR
+ 1105:TCP_BBR_HDWR_PACE
+ 1106:TCP_BBR_UTTER_MAX_TSO
+ 1107:TCP_BBR_EXTRA_STATE
+ 1108:TCP_BBR_FLOOR_MIN_TSO
+ 1109:TCP_BBR_MIN_TOPACEOUT
+ 1110:TCP_BBR_TSTMP_RAISES
+ 1111:TCP_BBR_POLICER_DETECT
+ 1112:TCP_BBR_RACK_INIT_RATE
+ 1113:TCP_RACK_RR_CONF
+ 1114:TCP_RACK_GP_INCREASE_CA
+ 1115:TCP_RACK_GP_INCREASE_SS
+ 1116:TCP_RACK_GP_INCREASE_REC
+ 1117:TCP_RACK_FORCE_MSEG
+ 1118:TCP_RACK_PACE_RATE_CA
+ 1119:TCP_RACK_PACE_RATE_SS
+ 1120:TCP_RACK_PACE_RATE_REC
+ 1122:TCP_NO_PRR
+ 1123:TCP_RACK_NONRXT_CFG_RATE
+ 1124:TCP_SHARED_CWND_ENABLE
+ 1125:TCP_TIMELY_DYN_ADJ
+ 1126:TCP_RACK_NO_PUSH_AT_MAX
+ 1127:TCP_RACK_PACE_TO_FILL
+ 1128:TCP_SHARED_CWND_TIME_LIMIT
+ 1129:TCP_RACK_PROFILE
+ 1130:TCP_HDWR_RATE_CAP
+ 1131:TCP_PACING_RATE_CAP
+ 1132:TCP_HDWR_UP_ONLY
+ 1133:TCP_RACK_ABC_VAL
+ 1134:TCP_REC_ABC_VAL
+ 1135:TCP_RACK_MEASURE_CNT
+ 1136:TCP_DEFER_OPTIONS
+ 1138:TCP_RACK_PACING_BETA
+ 1139:TCP_RACK_PACING_BETA_ECN
+ 1140:TCP_RACK_TIMER_SLOP
+ 1141:TCP_RACK_DSACK_OPT
+ 1142:TCP_RACK_ENABLE_HYSTART
+ 1143:TCP_RACK_SET_RXT_OPTIONS
+ 1144:TCP_RACK_HI_BETA
+ 1145:TCP_RACK_SPLIT_LIMIT
+ 1146:TCP_RACK_PACING_DIVISOR
+ 1147:TCP_RACK_PACE_MIN_SEG
+ 1148:TCP_RACK_DGP_IN_REC
+ 1150:TCP_HYBRID_PACING
+ 1151:TCP_PACING_DND
+ 1152:TCP_SS_EEXIT
+ 1153:TCP_DGP_UPPER_BOUNDS
+ 1154:TCP_NO_TIMELY
+ 1155:TCP_HONOR_HPTS_MIN
+ 1156:TCP_REC_IS_DYN
+ 1157:TCP_SIDECHAN_DIS
+ 1158:TCP_FILLCW_RATE_CAP
+ 1160:TCP_STACK_SPEC_INFO
+ 1161:RACK_CSPR_IS_FCC
+ 1162:TCP_GP_USE_LTBW
+ 1163:TCP_NGEN_GPUT_MAXSEG
+ 8192:TCP_FUNCTION_BLK
+ 8193:TCP_FUNCTION_ALIAS
+ 2147483643:TCP_RXT_CLAMP
+ 2147483645:TCP_LOG_DETAILED
+ 2147483647:TCP_LOG_RATIO
+
+ENUM sb_wake_type
+ 1:RCV
+ 2:SND
+
+ENUM output_way
+ 0:NORMAL
+ 1:PERSIST
+ 2:HPTS
+ 3:OUTPUT
+ 4:DROP_SYNFIN
+ 5:DROP_MISSING_TS
+ 6:COMPLETED
+
+ENUM collapse_dir
+ 0:UNCOLLAPSED
+ 1:COLLAPSED
+ 2:RSM_MARKED
+ 3:SPLIT
+
+/*
+ * Flags
+ */
+
+FLAG tcp_flags
+ 0x01:FIN
+ 0x02:SYN
+ 0x04:RST
+ 0x08:PUSH
+ 0x10:ACK
+ 0x20:URG
+ 0x40:ECE
+ 0x80:CWR
+ 0x100:AE
+
+FLAG sb_flags
+ 0x01:SB_TLS_RX
+ 0x02:SB_TLS_RX_RUNNING
+ 0x04:SB_WAIT
+ 0x08:SB_SEL
+ 0x10:SB_ASYNC
+ 0x20:SB_UPCALL
+ 0x40:SB_AUTOLOWAT
+ 0x80:SB_AIO
+ 0x100:SB_KNOTE
+ 0x200:SB_NOCOALESCE
+ 0x400:SB_IN_TOE
+ 0x800:SB_AUTOSIZE
+ 0x2000:SB_AIO_RUNNING
+ 0x4000:SB_SPLICED
+ 0x8000:SB_TLS_RX_RESYNC
+
+FLAG sb_state
+ 0x10:SBS_CANTSENDMORE
+ 0x20:SBS_CANTRCVMORE
+ 0x40:SBS_RCVATMARK
+
+FLAG sendfile_flags
+ 0x01:SF_NODISKIO
+ 0x02:SF_MNOWAIT
+ 0x08:SF_USER_READAHEAD
+ 0x10:SF_NOCACHE
+
+FLAG prus_flags
+ 0x1:PRUS_OOB
+ 0x2:PRUS_EOF
+ 0x4:PRUS_MORETOCOME
+ 0x8:PRUS_NOTREADY
+
+FLAG conn_params_flags
+ 0x01:CP_SACK_PERMIT
+ 0x02:CP_TIMESTAMPS
+ 0x04:CP_ECN
+ 0x08:CP_ACE
+ 0x10:CP_FASTOPEN
+ 0x20:CP_SIGNATURE
+ 0x40:CP_WINDOW_SCALE
+
+/*
+ * Structures - grouped by event category
+ */
+
+/* Connection / Session */
+STRUCT SESSION_CREATE
+ tp:void*
+STRUCT CONN_SET_IP_V4
+ src_addr:in_addr_t
+ src_port:uint16_t:ntohs
+ dst_addr:in_addr_t
+ dst_port:uint16_t:ntohs
+STRUCT CONN_SET_IP_V6
+ src_addr:in6_addr_t
+ src_port:uint16_t:ntohs
+ dst_addr:in6_addr_t
+ dst_port:uint16_t:ntohs
+STRUCT LOG_ID
+ log_id:char[64]
+STRUCT CONN_PARAMS
+ iss:uint32_t
+ irs:uint32_t
+ mss:uint32_t
+ snd_wscale:uint8_t
+ rcv_wscale:uint8_t
+ flags:uint8_t:flag_conn_params_flags
+ cc_algo:char[16]
+ tcp_stack:char[32]
+STRUCT TCP_STACK_CHANGE
+ tcp_stack:char[32]
+STRUCT CC_ALGO_CHANGE
+ cc_algo:char[16]
+STRUCT MSS
+ mss:uint32_t
+STRUCT CONN_OWNER
+ pid:int32_t
+
+/* Data plane */
+STRUCT IN
+ seq:uint32_t
+ ack:uint32_t
+ flags:uint16_t:flag_tcp_flags
+ win_raw:uint16_t
+ win:uint32_t
+ len:uint32_t
+ state:uint8_t:enum_tcp_state
+ th_off:uint8_t
+ th_sum:uint16_t:hex
+ th_urp:uint16_t
+ rcv_time_us:uint64_t
+STRUCT OUT
+ seq:uint32_t
+ ack:uint32_t
+ flags:uint16_t:flag_tcp_flags
+ win_raw:uint16_t
+ win:uint32_t
+ len:uint32_t
+ cwnd:uint32_t
+ state:uint8_t:enum_tcp_state
+ th_off:uint8_t
+ th_sum:uint16_t:hex
+ th_urp:uint16_t
+STRUCT USER_SEND
+ error:int32_t
+ flags:uint32_t:flag_prus_flags
+ state:uint8_t:enum_tcp_state
+ sndbuf_used:uint32_t
+ sndbuf_avail:uint32_t
+ send_len:uint32_t
+STRUCT SENDFILE
+ offset:uint64_t
+ length:uint64_t
+ flags:uint32_t:flag_sendfile_flags
+STRUCT ACK
+ acked:uint32_t
+ snd_una:uint32_t
+ inflight:uint32_t
+ sacked:uint32_t
+ nsegs:uint32_t
+ rcv_time_us:uint64_t
+STRUCT SACK
+ acked:uint32_t
+ snd_una:uint32_t
+ inflight:uint32_t
+ sacked:uint32_t
+ nsegs:uint32_t
+ rcv_time_us:uint64_t
+ num_blocks:uint8_t
+ blk1_start:uint32_t
+ blk1_end:uint32_t
+ blk2_start:uint32_t
+ blk2_end:uint32_t
+ blk3_start:uint32_t
+ blk3_end:uint32_t
+ blk4_start:uint32_t
+ blk4_end:uint32_t
+
+/* Congestion control / Window */
+STRUCT CWND
+ cwnd:uint32_t
+ ssthresh:uint32_t
+STRUCT CC_PARAMS
+ beta:uint32_t
+ beta_ecn:uint32_t
+STRUCT ENTER_RECOVERY
+ cwnd:uint32_t
+ ssthresh:uint32_t
+STRUCT EXIT_RECOVERY
+ cwnd:uint32_t
+ ssthresh:uint32_t
+STRUCT PRR_STATE
+ prr_out:uint32_t
+ recovery_fs:uint32_t
+ prr_sndcnt:uint32_t
+ prr_delivered:uint32_t
+ sacked:uint32_t
+ holes_rxt:uint32_t
+ inflight:uint32_t
+STRUCT WINDOW_COLLAPSE
+ count:uint32_t
+ direction:uint8_t:enum_collapse_dir
+ inflight:uint32_t
+ has_collapsed:uint8_t
+STRUCT HYSTART
+ round:uint32_t
+ high_seq:uint32_t
+ snd_max:uint32_t
+ mod:uint8_t
+STRUCT GOODPUT
+ gp_bw:uint64_t
+ raw_bw:uint64_t
+ utim:uint64_t
+ gp_srtt:uint32_t
+ lowest_rtt:uint32_t
+
+/* RTT */
+STRUCT RTT
+ rtt:uint32_t
+ srtt:uint32_t
+ path_rtt:uint32_t
+ lowest_rtt:uint32_t
+ gp_filter_min:uint32_t
+ gp_srtt:uint32_t
+
+STRUCT RTT_SAMPLE
+ rtt:uint32_t
+
+/* Loss / Retransmission */
+STRUCT RTO
+ timeout_num:int32_t
+ min_rtt:uint32_t
+ rtt:uint32_t
+ segment_len:uint32_t
+ inflight:uint32_t
+ pkts_out:uint32_t
+STRUCT RETRANSMIT_REASON
+ threshold:uint32_t
+ dupack_count:uint32_t
+ rsm_start:uint32_t
+ rsm_end:uint32_t
+ inflight:uint32_t
+STRUCT DSACK
+ dsack_round_seen:uint8_t
+ dsack_round_end:uint32_t
+ num_dsack:uint32_t
+ dsack_persist:uint32_t
+ mod:uint8_t
+ round:uint32_t
+
+/* Scheduling / Timers */
+STRUCT TIMER_START
+ timeout_us:uint64_t
+ reason:uint8_t:enum_timer_reason
+STRUCT TIMER_CANCEL
+ reason:uint8_t:enum_timer_reason
+STRUCT TIMER_FIRED
+ reason:uint8_t:enum_timer_reason
+STRUCT TIMER_PROCESSED
+ reason:uint8_t:enum_timer_reason
+STRUCT SB_WAKE
+ sb_flags:uint32_t:flag_sb_flags
+ len:uint32_t
+ sb_state:uint32_t:flag_sb_state
+ type:uint8_t:enum_sb_wake_type
+
+/* Pacing */
+STRUCT PACING_CALC
+ slot:uint32_t
+ bw_est:uint64_t
+ method:uint8_t
+ quality:uint8_t
+ in_slow_start:uint8_t
+ inflight:uint32_t
+STRUCT PACE_STATE
+ rc_pace_min_segs:uint32_t
+ rc_pace_max_segs:uint32_t
+STRUCT HW_PACE_RATE
+ requested_rate:uint64_t
+ hw_rate:uint64_t
+ error:int32_t
+ mod:uint8_t
+
+/* Configuration */
+STRUCT SOCKET_OPT_UINT32
+ option_num:uint32_t:enum_socket_option
+ option_val:uint32_t
+STRUCT SOCKET_OPT_ERR
+ option_num:uint32_t:enum_socket_option
+ err:int32_t
+
+/* Output */
+STRUCT OUTPUT_DECISION
+ did_output:int32_t
+ nxt_pkt:int32_t
+ way_out:int32_t:enum_output_way
+ nsegs:int32_t
+ inflight:uint32_t
+ in_persist:uint8_t
+
+/*
+ * Events - grouped by category, renumbered for logical ordering
+ */
+
+/* Session */
+EVENT SESSION_CREATE 4294967294 INFO SESSION SESSION_CREATE
+ "Session created (tp=%1)"
+
+/* Connection (0-99) */
+EVENT CONN_SET_IP_V4 0 INFO CONN CONN_SET_IP_V4
+ "IPv4 connection %1:%2 -> %3:%4"
+EVENT CONN_SET_IP_V6 1 INFO CONN CONN_SET_IP_V6
+ "IPv6 connection %1:%2 -> %3:%4"
+EVENT LOG_ID 2 INFO CONN LOG_ID
+ "Log ID assigned: %1"
+EVENT CONN_PARAMS 3 INFO CONN CONN_PARAMS
+ "Connection parameters: ISS %1, IRS %2, MSS %3, snd_wscale %4, rcv_wscale %5, flags %6, cc %7, stack %8"
+EVENT MSS 4 INFO CONN MSS
+ "MSS set to %1 bytes"
+EVENT CONN_OWNER 5 INFO CONN CONN_OWNER
+ "Connection owned by PID %1"
+EVENT TCP_STACK_CHANGE 6 INFO CONN TCP_STACK_CHANGE
+ "TCP stack changed to %1"
+EVENT CC_ALGO_CHANGE 7 INFO CONN,CC CC_ALGO_CHANGE
+ "CC algorithm changed to %1"
+
+/* Data plane (100-199) */
+EVENT IN 100 VERBOSE RX IN
+ "Received %6 bytes (seq %1, ack %2, win %5, flags %3) in state %7 rcv_time %11 us"
+EVENT OUT 101 VERBOSE TX OUT
+ "Sent %6 bytes (seq %1, ack %2, win %5, flags %3) in state %8 cwnd %7"
+EVENT ACK 102 VERBOSE RX,CC ACK
+ "Acknowledged %1 bytes to %2 (%3 in flight, %4 sacked, %5 segments, rcv_time %6 us)"
+EVENT SACK 103 VERBOSE RX,CC SACK
+ "SACKed %1 bytes to %2 (%3 in flight, %4 sacked, %5 segs, rcv_time %6 us, %7 blocks: [%8-%9] [%10-%11] [%12-%13] [%14-%15])"
+EVENT USER_SEND 104 INFO TX,APP USER_SEND
+ "Application send of %6 bytes in state %3 (%4 used, %5 available in sndbuf, flags %2, error %1)"
+EVENT SENDFILE 105 INFO TX,APP SENDFILE
+ "Sendfile %2 bytes at offset %1 (flags %3)"
+
+/* Congestion control / Window (200-299) */
+EVENT CWND 200 VERBOSE CC CWND
+ "Congestion window now %1 (ssthresh %2)"
+EVENT CC_PARAMS 201 INFO CC CC_PARAMS
+ "CC parameters changed: beta %1, beta_ecn %2"
+EVENT ENTER_RECOVERY 202 WARN CC,LOSS ENTER_RECOVERY
+ "Entered loss recovery (cwnd %1, ssthresh %2)"
+EVENT EXIT_RECOVERY 203 WARN CC,LOSS EXIT_RECOVERY
+ "Exited loss recovery (cwnd %1, ssthresh %2)"
+EVENT PRR_STATE 204 TRACE CC,LOSS PRR_STATE
+ "PRR recovery: %1 sent of %3 allowed, %4 delivered of %2 flight (%5 sacked, %6 retransmitted, %7 in flight)"
+EVENT WINDOW_COLLAPSE 205 WARN CC WINDOW_COLLAPSE
+ "Peer window %2: %1 segments affected (%3 bytes in flight)"
+EVENT HYSTART 206 TRACE CC HYSTART
+ "Slow start round %1 completed (high_seq %2, snd_max %3, event %4)"
+EVENT GOODPUT 207 INFO CC GOODPUT
+ "Goodput sample %2 bps (EWMA %1 bps) over %3 us, gp_srtt %4 us, lowest_rtt %5 us"
+
+/* RTT (300-399) */
+EVENT RTT 300 INFO RTT RTT
+ "RTT sample %1 us applied to srtt %2 us (path %3 us, all-time low %4 us, GP min %5 us, gp_srtt %6 us)"
+EVENT RTT_SAMPLE 301 TRACE RTT RTT_SAMPLE
+ "RTT sample %1 us"
+
+/* Loss / Retransmission (400-499) */
+EVENT RTO 400 WARN LOSS,TX RTO
+ "Timeout #%1 (rtt %3 us, min_rtt %2 us, %4 byte segment, %5 in flight, %6 pkts out)"
+EVENT RETRANSMIT_REASON 401 WARN LOSS,TX RETRANSMIT_REASON
+ "Retransmitting bytes %3-%4 (threshold %1, %2 dupacks, %5 in flight)"
+EVENT DSACK 402 INFO LOSS,RX DSACK
+ "DSACK detected: %3 total in round %6 (persist %4)"
+
+/* Timers (500-599) */
+EVENT TIMER_START 500 VERBOSE TIMER TIMER_START
+ "Started %2 timer for %1 us"
+EVENT TIMER_CANCEL 501 VERBOSE TIMER TIMER_CANCEL
+ "Canceled %1 timer"
+EVENT TIMER_FIRED 502 VERBOSE TIMER TIMER_FIRED
+ "Fired %1 timer"
+EVENT SB_WAKE 503 VERBOSE APP SB_WAKE
+ "Socket buffer %4 wakeup (%2 bytes, flags %1, state %3)"
+EVENT TIMER_PROCESSED 504 VERBOSE TIMER TIMER_PROCESSED
+ "Processed %1 timer"
+
+/* Pacing (600-699) */
+EVENT PACING_CALC 600 VERBOSE PACING PACING_CALC
+ "Pacing delay %1 us at %2 bps (method %3, quality %4, %6 in flight)"
+EVENT PACE_STATE 601 TRACE PACING PACE_STATE
+ "Pacing burst size: min %1, max %2 bytes per segment"
+EVENT HW_PACE_RATE 602 INFO PACING HW_PACE_RATE
+ "Hardware pacer set to %2 bps (requested %1 bps, error %3)"
+
+/* Configuration (700-799) */
+EVENT SOCKET_OPT_UINT32 700 INFO APP,CONN SOCKET_OPT_UINT32
+ "Socket option %1 set to %2"
+EVENT SOCKET_OPT_ERR 701 WARN APP,CONN SOCKET_OPT_ERR
+ "Socket option %1 failed with error %2"
+
+/* Output (800-899) */
+EVENT OUTPUT_DECISION 800 TRACE TX OUTPUT_DECISION
+ "Segment processing done: sent %4 segments via %3 (%5 in flight, persist %6)"
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -31,7 +31,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd May 7, 2026
+.Dd May 14, 2026
.Dt TCP 4
.Os
.Sh NAME
@@ -145,6 +145,24 @@
See
.Xr mod_cc 4
for details.
+.It Dv TCP_EVENTLOG
+Enable or disable per-connection event logging via the kernel eventlog framework.
+This option accepts a single integer argument: a non-zero value enables event
+logging for the connection, and zero disables it.
+.Pp
+When enabled, the TCP stack emits structured events for the connection
+.Pq e.g., segment transmission and reception, RTT samples, congestion window changes, retransmissions
+that can be captured by the
+.Xr elog 1
+user-space tool.
+Enabling eventlog on a connection also triggers an immediate dump of the
+current connection state, so that a subscriber sees the full context even if
+it attached after the connection was established.
+.Pp
+The system-wide default for new connections is controlled by the
+.Va kern.eventlog.tcp.default
+sysctl, documented in
+.Xr eventlog 9 .
.It Dv TCP_FASTOPEN
Enable or disable TCP Fast Open (TFO).
To use this option, the kernel must be built with the
@@ -1092,6 +1110,7 @@
when trying to use a TCP function block that is not available;
.El
.Sh SEE ALSO
+.Xr elog 1 ,
.Xr getsockopt 2 ,
.Xr setfib 2 ,
.Xr socket 2 ,
@@ -1110,6 +1129,7 @@
.Xr tcp_rack 4 ,
.Xr setkey 8 ,
.Xr sysctl 8 ,
+.Xr eventlog 9 ,
.Xr tcp_functions 9
.Rs
.%A "V. Jacobson"
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -1130,12 +1130,12 @@
td);
sendfile_iodone(sfio, NULL, 0, error);
}
-#ifdef TCP_REQUEST_TRK
if (so->so_proto->pr_protocol == IPPROTO_TCP) {
- /* log the sendfile call to the TCP log, if enabled */
+ tcp_eventlog_sendfile(so, offset, nbytes, flags);
+#ifdef TCP_REQUEST_TRK
tcp_log_sendfile(so, offset, nbytes, flags);
- }
#endif
+ }
m = NULL;
if (error)
goto done;
diff --git a/sys/modules/cc/cc_cubic/Makefile b/sys/modules/cc/cc_cubic/Makefile
--- a/sys/modules/cc/cc_cubic/Makefile
+++ b/sys/modules/cc/cc_cubic/Makefile
@@ -2,4 +2,6 @@
KMOD= cc_cubic
SRCS= cc_cubic.c opt_kern_tls.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
.include <bsd.kmod.mk>
diff --git a/sys/modules/cc/cc_newreno/Makefile b/sys/modules/cc/cc_newreno/Makefile
--- a/sys/modules/cc/cc_newreno/Makefile
+++ b/sys/modules/cc/cc_newreno/Makefile
@@ -2,4 +2,6 @@
KMOD= cc_newreno
SRCS= cc_newreno.c opt_kern_tls.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
.include <bsd.kmod.mk>
diff --git a/sys/modules/tcp/bbr/Makefile b/sys/modules/tcp/bbr/Makefile
--- a/sys/modules/tcp/bbr/Makefile
+++ b/sys/modules/tcp/bbr/Makefile
@@ -11,6 +11,8 @@
SRCS+= opt_kern_tls.h
SRCS+= opt_ratelimit.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
#
# Enable full debugging
#
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -11,6 +11,8 @@
SRCS+= opt_kern_tls.h
SRCS+= opt_ratelimit.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
#
# Enable full debugging
#
diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c
--- a/sys/netinet/cc/cc_cubic.c
+++ b/sys/netinet/cc/cc_cubic.c
@@ -69,6 +69,8 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
+#include <sys/eventlog.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>
@@ -123,9 +125,11 @@
*/
struct tcpcb *tp;
+ tp = ccv->tp;
+ TCP_EVENTLOG_HYSTART_LOG(tp->t_eventlog_session,
+ cubicd->css_current_round, 0, tp->snd_max, mod);
if (hystart_bblogs == 0)
return;
- tp = ccv->tp;
if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c
--- a/sys/netinet/cc/cc_newreno.c
+++ b/sys/netinet/cc/cc_newreno.c
@@ -79,6 +79,8 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
+#include <sys/eventlog.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>
#include <netinet/cc/cc_newreno.h>
@@ -133,9 +135,13 @@
*/
struct tcpcb *tp;
- if (hystart_bblogs == 0)
+ if ((mod != 7) && (mod != 2) && (mod != 6))
return;
tp = ccv->tp;
+ TCP_EVENTLOG_HYSTART_LOG(tp->t_eventlog_session,
+ nreno->css_current_round, 0, tp->snd_max, mod);
+ if (hystart_bblogs == 0)
+ return;
if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -219,6 +219,7 @@
#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */
#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */
#define TCP_PERF_INFO 78 /* retrieve accounting counters */
+#define TCP_EVENTLOG 81 /* enable/disable eventlog */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -107,6 +107,8 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_hpts.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
@@ -1513,6 +1515,18 @@
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
tlen, NULL, true);
+ TCP_EVENTLOG_IN_LOG(tp->t_eventlog_session,
+ th->th_seq,
+ th->th_ack,
+ thflags,
+ th->th_win,
+ (uint32_t)th->th_win << tp->snd_scale,
+ tlen,
+ tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ th->th_urp,
+ tcp_get_u64_usecs(NULL));
if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
@@ -1805,7 +1819,19 @@
* typically means increasing the congestion
* window.
*/
+ {
+ uint32_t cwnd_before = tp->snd_cwnd;
cc_ack_received(tp, th, nsegs, CC_ACK);
+ TCP_EVENTLOG_ACK_LOG(tp->t_eventlog_session,
+ acked, th->th_ack,
+ tp->snd_max - th->th_ack,
+ 0, nsegs,
+ tcp_get_u64_usecs(NULL));
+ if (tp->snd_cwnd != cwnd_before)
+ TCP_EVENTLOG_CWND_LOG(
+ tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
+ }
tp->snd_una = th->th_ack;
/*
@@ -2038,6 +2064,8 @@
tcp_ecn_input_syn_sent(tp, thflags, iptos);
+ tcp_eventlog_conn_params(tp);
+
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
@@ -2650,6 +2678,9 @@
}
/* Congestion signal before ack. */
cc_cong_signal(tp, th, CC_NDUPACK);
+ TCP_EVENTLOG_ENTER_RECOVERY_LOG(
+ tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
cc_ack_received(tp, th, nsegs, CC_DUPACK);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
@@ -2953,7 +2984,18 @@
* control related information. This typically means increasing
* the congestion window.
*/
+ {
+ uint32_t cwnd_before = tp->snd_cwnd;
cc_ack_received(tp, th, nsegs, CC_ACK);
+ TCP_EVENTLOG_ACK_LOG(tp->t_eventlog_session,
+ acked, th->th_ack,
+ tp->snd_max - th->th_ack,
+ 0, nsegs,
+ tcp_get_u64_usecs(NULL));
+ if (tp->snd_cwnd != cwnd_before)
+ TCP_EVENTLOG_CWND_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
+ }
if (acked > sbavail(&so->so_snd)) {
if (tp->snd_wnd >= sbavail(&so->so_snd))
@@ -2983,6 +3025,9 @@
if (IN_RECOVERY(tp->t_flags) &&
SEQ_GEQ(th->th_ack, tp->snd_recover)) {
cc_post_recovery(tp, th);
+ TCP_EVENTLOG_EXIT_RECOVERY_LOG(
+ tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
}
if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
tp->snd_recover = tp->snd_una;
@@ -3616,6 +3661,14 @@
TCPSTAT_INC(tcps_rttupdated);
if (tp->t_rttupdated < UCHAR_MAX)
tp->t_rttupdated++;
+
+ TCP_EVENTLOG_RTT_LOG(tp->t_eventlog_session,
+ rtt * 1000,
+ (tp->t_srtt >> TCP_RTT_SHIFT) * 1000,
+ 0,
+ tp->t_rttlow,
+ 0, 0);
+
#ifdef STATS
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
imax(0, rtt * 1000 / hz));
@@ -3850,6 +3903,7 @@
mss = max(mss, 64);
tp->t_maxseg = mss;
+ TCP_EVENTLOG_MSS_LOG(tp->t_eventlog_session, tp->t_maxseg);
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
* The MSS is so small we should not process incoming
@@ -3912,6 +3966,7 @@
* XXXGL: shouldn't we reserve space for IP/IPv6 options?
*/
tp->t_maxseg = max(mss, 64);
+ TCP_EVENTLOG_MSS_LOG(tp->t_eventlog_session, tp->t_maxseg);
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
* The MSS is so small we should not process incoming
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -573,6 +573,7 @@
#ifdef DDB
void db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent);
#endif
+const char *tcp_log_id_str(struct tcpcb *tp);
#else /* !TCP_BLACKBOX */
#define tcp_log_verbose (false)
@@ -586,6 +587,11 @@
return (NULL);
}
+static inline const char *
+tcp_log_id_str(struct tcpcb *tp __unused)
+{
+ return (NULL);
+}
#endif /* TCP_BLACKBOX */
#endif /* _KERNEL */
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -355,6 +355,13 @@
RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
+const char *
+tcp_log_id_str(struct tcpcb *tp)
+{
+
+ return (tp->t_lib != NULL) ? tp->t_lib->tlb_id : NULL;
+}
+
static __inline void
tcp_log_id_validate_tree_lock(int tree_locked)
{
@@ -500,6 +507,8 @@
tcp_log_id_validate_tree_lock(*tree_locked);
TCPID_BUCKET_LOCK_ASSERT(tlb);
+ tcp_eventlog_on_log_id_clear(tp);
+
/*
* Remove the node, clear the log bucket and node from the TCPCB, and
* decrement the bucket refcount. In the process, if this is the
@@ -987,6 +996,9 @@
SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
tp->t_lib = tlb;
tp->t_lin = tln;
+
+ tcp_eventlog_on_log_id_set(tp, tlb->tlb_id);
+
if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) {
/* Clone in any logging */
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -79,6 +79,7 @@
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcpip.h>
@@ -1426,6 +1427,19 @@
else
lgb = NULL;
+ TCP_EVENTLOG_OUT_LOG(tp->t_eventlog_session,
+ ntohl(th->th_seq),
+ ntohl(th->th_ack),
+ tcp_get_flags(th),
+ ntohs(th->th_win),
+ (uint32_t)ntohs(th->th_win) << tp->rcv_scale,
+ len,
+ tp->snd_cwnd,
+ tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ ntohs(th->th_urp));
+
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -92,6 +92,7 @@
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_lro.h>
#ifdef TCP_OFFLOAD
@@ -2265,6 +2266,18 @@
TCP_LOG_IN, 0,
tlen, &log, true, &bbr->rc_tv);
}
+ TCP_EVENTLOG_IN_LOG(bbr->rc_tp->t_eventlog_session,
+ th->th_seq,
+ th->th_ack,
+ tcp_get_flags(th),
+ th->th_win,
+ (uint32_t)th->th_win << bbr->rc_tp->snd_scale,
+ tlen,
+ bbr->rc_tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ th->th_urp,
+ tcp_tv_to_lusec(&bbr->rc_tv));
}
static void
@@ -8789,6 +8802,7 @@
(TF_RCVD_SCALE | TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
}
+ tcp_eventlog_conn_params(tp);
tp->rcv_adv += min(tp->rcv_wnd,
TCP_MAXWIN << tp->rcv_scale);
/*
@@ -13505,6 +13519,18 @@
} else {
lgb = NULL;
}
+ TCP_EVENTLOG_OUT_LOG(tp->t_eventlog_session,
+ ntohl(th->th_seq),
+ ntohl(th->th_ack),
+ tcp_get_flags(th),
+ ntohs(th->th_win),
+ (uint32_t)ntohs(th->th_win) << tp->rcv_scale,
+ len,
+ tp->snd_cwnd,
+ tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ ntohs(th->th_urp));
/*
* Fill in IP length and desired time to live and send to IP level.
* There should be a better way to handle ttl and tos; we could keep
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -88,6 +88,8 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <sys/eventlog.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
@@ -2828,6 +2830,8 @@
static void
rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
{
+ TCP_EVENTLOG_RTT_SAMPLE_LOG(rack->rc_tp->t_eventlog_session, rtt);
+
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
@@ -5413,6 +5417,21 @@
tp->t_ccv.flags |= CCF_USE_LOCAL_ABC;
CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
}
+ if (tp->snd_cwnd != prior_cwnd)
+ TCP_EVENTLOG_CWND_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
+ /*
+ * XXX rack_ack_received() does not currently have access to the
+ * caller's SACK blocks, so we always log a plain ACK event. Once
+ * the rack stack threads SACK info into rack_ack_received(), this
+ * call should switch to TCP_EVENTLOG_SACK_LOG when SACK info is
+ * present.
+ */
+ TCP_EVENTLOG_ACK_LOG(tp->t_eventlog_session,
+ acked, th_ack,
+ ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked),
+ rack->r_ctl.rc_sacked, nsegs,
+ tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
if (lgb) {
lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
}
@@ -5476,9 +5495,8 @@
static void
rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
{
- /*
- * Now exit recovery.
- */
+ TCP_EVENTLOG_EXIT_RECOVERY_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
EXIT_RECOVERY(tp->t_flags);
}
@@ -5660,6 +5678,8 @@
CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
}
if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
+ TCP_EVENTLOG_ENTER_RECOVERY_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
rack_log_to_prr(rack, 15, cwnd_enter, line);
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.retran_during_recovery = 0;
@@ -7654,6 +7674,13 @@
} else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
tp->t_flags &= ~TF_PREVVALID;
KMOD_TCPSTAT_INC(tcps_rexmttimeo);
+ TCP_EVENTLOG_RTO_LOG(tp->t_eventlog_session,
+ tp->t_rxtshift,
+ tp->t_rttlow,
+ tp->t_srtt >> TCP_RTT_SHIFT,
+ tp->t_maxseg,
+ tp->snd_max - tp->snd_una,
+ 0);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
@@ -8660,6 +8687,13 @@
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
rack_log_rtt_sample(rack, rtt);
+ TCP_EVENTLOG_RTT_LOG(tp->t_eventlog_session,
+ rtt,
+ tp->t_srtt,
+ rack->r_ctl.rack_rs.rs_us_rtt,
+ rack->r_ctl.rc_lowest_us_rtt,
+ get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
+ rack->r_ctl.rc_gp_srtt);
tp->t_softerror = 0;
}
@@ -12808,6 +12842,8 @@
tcp_ecn_input_syn_sent(tp, thflags, iptos);
+ tcp_eventlog_conn_params(tp);
+
if (SEQ_GT(th->th_ack, tp->snd_una)) {
/*
* We advance snd_una for the
@@ -24693,6 +24729,12 @@
goto free_uma;
}
rack_init_sysctls();
+ static const struct eventlog_provider_config tcp_eventlog_cfg = {
+ .dump_callback = tcp_eventlog_dump_state,
+ .default_changed = tcp_eventlog_default_changed,
+ };
+ __tcp_rack.tfb_eventlog_provider = eventlog_provider_create(
+ "tcp", &tcp_eventlog_cfg);
num_stacks = nitems(rack_stack_names);
err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
rack_stack_names, &num_stacks);
@@ -24700,6 +24742,9 @@
printf("Failed to register %s stack name for "
"%s module\n", rack_stack_names[num_stacks],
__XSTRING(MODNAME));
+ eventlog_provider_destroy(
+ __tcp_rack.tfb_eventlog_provider);
+ __tcp_rack.tfb_eventlog_provider = NULL;
sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
uma_zdestroy(rack_zone);
@@ -24726,6 +24771,8 @@
rack_mod_inited = false;
}
tcp_lro_dereg_mbufq();
+ eventlog_provider_destroy(__tcp_rack.tfb_eventlog_provider);
+ __tcp_rack.tfb_eventlog_provider = NULL;
err = 0;
break;
default:
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -102,6 +102,7 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_ecn.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_lro.h>
@@ -256,6 +257,14 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, eventlog, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP eventlog controls");
+
+static int tcp_eventlog_log_id_limit = 0;
+SYSCTL_INT(_net_inet_tcp_eventlog, OID_AUTO, log_id_limit, CTLFLAG_RW,
+ &tcp_eventlog_log_id_limit, 0,
+ "Max eventlog sessions to create for Log ID buckets (-1=unlimited, 0=none); decrements as used");
+
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@@ -492,6 +501,287 @@
return (rblk);
}
+void
+tcp_eventlog_default_changed(struct eventlog_provider *provider __unused,
+ int value, void *arg __unused)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct inpcb *inp;
+ int enabled;
+
+ if (value != -1 && value != 2)
+ return;
+
+ enabled = (value == 2) ? 1 : 0;
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_RLOCKPCB);
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ struct tcpcb *tp = intotcpcb(inp);
+ if (tp->t_eventlog_session == NULL)
+ continue;
+ if (enabled &&
+ !eventlog_session_is_enabled(
+ tp->t_eventlog_session)) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp,
+ tcp_log_id_str(tp));
+ } else if (!enabled) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 0);
+ }
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+}
+
+/*
+ * Handle eventlog session when switching TCP stacks.
+ * - If the new stack has a provider and we have no session, create one.
+ * If the user previously set TCP_EVENTLOG (TF2_EVENTLOG_ENABLED),
+ * honor it.
+ * - If the new stack has no provider and we have a session, destroy it.
+ * Caller must hold inp lock.
+ */
+void
+tcp_ensure_eventlog_session_on_switch(struct tcpcb *tp,
+ struct tcp_function_block *tfb)
+{
+ struct tcp_eventlog_session_create create_payload;
+
+ INP_WLOCK_ASSERT(tptoinpcb(tp));
+ if (tfb->tfb_eventlog_provider != NULL) {
+ /* New stack has a provider: create a session if needed. */
+ if (tp->t_eventlog_session != NULL) {
+ /* Mid-connection swap between two eventlog stacks */
+ TCP_EVENTLOG_TCP_STACK_CHANGE_LOG(
+ tp->t_eventlog_session,
+ tfb->tfb_tcp_block_name);
+ return;
+ }
+ create_payload.tp = tp;
+ tp->t_eventlog_session = eventlog_session_create(
+ tfb->tfb_eventlog_provider,
+ tptoinpcb(tp)->inp_gencnt,
+ false,
+ &create_payload,
+ sizeof(create_payload));
+ if (tp->t_flags2 & TF2_EVENTLOG_ENABLED) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp,
+ tcp_log_id_str(tp));
+ }
+ } else {
+ /* New stack has no provider: end any existing session. */
+ if (tp->t_eventlog_session != NULL)
+ TCP_EVENTLOG_TCP_STACK_CHANGE_LOG(
+ tp->t_eventlog_session,
+ tfb->tfb_tcp_block_name);
+ eventlog_session_destroy(tp->t_eventlog_session);
+ tp->t_eventlog_session = NULL;
+ }
+}
+
+/*
+ * Emit a CONN_PARAMS event derived entirely from tp. Callers must invoke
+ * this only after the handshake has settled the negotiated parameters
+ * (window scales, MSS, ECN/SACK/etc). Reads the negotiated receive window
+ * scale from tp->request_r_scale (not tp->rcv_scale): the passive-open path
+ * emits before tp->rcv_scale is installed, and after the handshake completes
+ * tp->rcv_scale = tp->request_r_scale, so request_r_scale is the canonical
+ * source in both states.
+ */
+void
+tcp_eventlog_conn_params(struct tcpcb *tp)
+{
+ uint8_t flags = 0;
+ const char *cc_name = "";
+ const char *stack_name = "";
+
+ KASSERT(tp != NULL, ("tcp_eventlog_conn_params: tp == NULL"));
+
+ if (!TCP_EVENTLOG_CONN_PARAMS_ENABLED(tp->t_eventlog_session))
+ return;
+
+ if (tp->t_flags & TF_SACK_PERMIT)
+ flags |= TCP_EVENTLOG_FLAG_CP_SACK_PERMIT;
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ flags |= TCP_EVENTLOG_FLAG_CP_TIMESTAMPS;
+ if (tp->t_flags2 & TF2_ECN_PERMIT)
+ flags |= TCP_EVENTLOG_FLAG_CP_ECN;
+ if (tp->t_flags2 & TF2_ACE_PERMIT)
+ flags |= TCP_EVENTLOG_FLAG_CP_ACE;
+ if (tp->t_flags & TF_FASTOPEN)
+ flags |= TCP_EVENTLOG_FLAG_CP_FASTOPEN;
+ if (tp->t_flags & TF_SIGNATURE)
+ flags |= TCP_EVENTLOG_FLAG_CP_SIGNATURE;
+ /*
+ * Window scaling is "negotiated" only when both sides exchanged the
+ * option in the SYN/SYN-ACK; without this flag, snd_wscale=rcv_wscale=0
+ * is ambiguous between "shift 0 negotiated" and "wscale absent".
+ */
+ if ((tp->t_flags & (TF_REQ_SCALE | TF_RCVD_SCALE)) ==
+ (TF_REQ_SCALE | TF_RCVD_SCALE))
+ flags |= TCP_EVENTLOG_FLAG_CP_WINDOW_SCALE;
+
+ if (CC_ALGO(tp) != NULL)
+ cc_name = CC_ALGO(tp)->name;
+ if (tp->t_fb != NULL)
+ stack_name = tp->t_fb->tfb_tcp_block_name;
+
+ TCP_EVENTLOG_CONN_PARAMS_LOG_ALWAYS(tp->t_eventlog_session,
+ tp->iss, tp->irs, tp->t_maxseg, tp->snd_scale, tp->request_r_scale,
+ flags, cc_name, stack_name);
+}
+
+/*
+ * Emit eventlog events for a TCP connection's current state (SESSION_CREATE,
+ * CONN_SET_IP, LOG_ID). Caller must hold the inp lock.
+ */
+void
+tcp_eventlog_dump_session(struct tcpcb *tp, const char *log_id)
+{
+ struct inpcb *inp;
+
+ if (tp == NULL || tp->t_eventlog_session == NULL)
+ return;
+
+ inp = tptoinpcb(tp);
+
+ TCP_EVENTLOG_SESSION_CREATE_LOG(tp->t_eventlog_session, tp);
+
+ /*
+ * Only emit the address once the local port is assigned; otherwise a
+ * socket that enables logging before bind()/connect() would log an
+ * all-zero 4-tuple, and a later connect() would log a second one.
+ * A zero foreign port is fine: that is a listening socket.
+ */
+#ifdef INET
+ if ((inp->inp_vflag & INP_IPV4) && inp->inp_lport != 0) {
+ TCP_EVENTLOG_CONN_SET_IP_V4_LOG(tp->t_eventlog_session,
+ inp->inp_laddr, inp->inp_lport,
+ inp->inp_faddr, inp->inp_fport);
+ }
+#endif
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) && inp->inp_lport != 0) {
+ TCP_EVENTLOG_CONN_SET_IP_V6_LOG(tp->t_eventlog_session,
+ inp->inp_inc.inc6_laddr, inp->inp_lport,
+ inp->inp_inc.inc6_faddr, inp->inp_fport);
+ }
+#endif
+
+ if (log_id != NULL)
+ TCP_EVENTLOG_LOG_ID_LOG(tp->t_eventlog_session, log_id);
+
+ if (tp->iss != 0 || tp->irs != 0)
+ tcp_eventlog_conn_params(tp);
+
+ if (tp->t_owning_pid != 0)
+ TCP_EVENTLOG_CONN_OWNER_LOG(tp->t_eventlog_session,
+ tp->t_owning_pid);
+}
+
+/*
+ * Provider dump callback: emit current state for all TCP eventlog sessions.
+ * Invoked inline on the subscribing thread; uses normal event write macros
+ * which are automatically routed to the requesting subscriber.
+ */
+void
+tcp_eventlog_dump_state(struct eventlog_provider *provider __unused,
+ void *arg __unused)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct inpcb *inp;
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_RLOCKPCB);
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ struct tcpcb *tp = intotcpcb(inp);
+ if (tp->t_eventlog_session == NULL)
+ continue;
+ tcp_eventlog_dump_session(tp, tcp_log_id_str(tp));
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+}
+
+void
+tcp_eventlog_on_log_id_set(struct tcpcb *tp, const char *log_id)
+{
+ struct eventlog_provider *prov = tp->t_fb->tfb_eventlog_provider;
+
+ if (eventlog_provider_get_default(prov) == 0) {
+ bool can_enable = true;
+ if (tcp_eventlog_log_id_limit >= 0) {
+ int old;
+ do {
+ old = atomic_load_acq_int(
+ &tcp_eventlog_log_id_limit);
+ if (old <= 0) {
+ can_enable = false;
+ break;
+ }
+ } while (!atomic_cmpset_int(
+ &tcp_eventlog_log_id_limit,
+ old, old - 1));
+ }
+ if (can_enable) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp, log_id);
+ }
+ } else {
+ tcp_eventlog_dump_session(tp, log_id);
+ }
+}
+
+void
+tcp_eventlog_on_log_id_clear(struct tcpcb *tp)
+{
+ if (tp != NULL &&
+ eventlog_provider_get_default(tp->t_fb->tfb_eventlog_provider) == 0)
+ eventlog_session_set_enabled(tp->t_eventlog_session, 0);
+}
+
+void
+tcp_eventlog_sendfile(struct socket *so, off_t offset, size_t nbytes,
+ int flags)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_eventlog_sendfile: inp == NULL"));
+
+ /* quick check to see if logging is enabled for this connection */
+ tp = intotcpcb(inp);
+ if (tp->t_flags & TF_DISCONNECTED)
+ return;
+
+ INP_WLOCK(inp);
+ /* double check log state now that we have the lock */
+ if (tp->t_flags & TF_DISCONNECTED) {
+ INP_WUNLOCK(inp);
+ return;
+ }
+ TCP_EVENTLOG_SENDFILE_LOG(tp->t_eventlog_session,
+ offset, nbytes, flags);
+ INP_WUNLOCK(inp);
+}
+
void
tcp_switch_back_to_default(struct tcpcb *tp)
{
@@ -532,6 +822,7 @@
/* Now set in all the pointers */
tp->t_fb = tfb;
tp->t_fb_ptr = ptr;
+ tcp_ensure_eventlog_session_on_switch(tp, tfb);
return;
}
/*
@@ -566,6 +857,7 @@
/* And set in the pointers to the new */
tp->t_fb = tfb;
tp->t_fb_ptr = ptr;
+ tcp_ensure_eventlog_session_on_switch(tp, tfb);
}
static bool
@@ -1499,6 +1791,12 @@
/* Setup the tcp function block list */
TAILQ_INIT(&t_functions);
rw_init(&tcp_function_lock, "tcp_func_lock");
+ static const struct eventlog_provider_config tcp_eventlog_cfg = {
+ .dump_callback = tcp_eventlog_dump_state,
+ .default_changed = tcp_eventlog_default_changed,
+ };
+ tcp_def_funcblk.tfb_eventlog_provider = eventlog_provider_create("tcp",
+ &tcp_eventlog_cfg);
register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
sx_init(&tcpoudp_lock, "TCP over UDP configuration");
#ifdef TCP_BLACKBOX
@@ -2094,6 +2392,20 @@
if (flags & TH_RST)
TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
lgb = NULL;
+ if (tp != NULL) {
+ TCP_EVENTLOG_OUT_LOG(tp->t_eventlog_session,
+ ntohl(nth->th_seq),
+ ntohl(nth->th_ack),
+ tcp_get_flags(nth),
+ ntohs(nth->th_win),
+ (uint32_t)ntohs(nth->th_win) << tp->rcv_scale,
+ 0,
+ tp->snd_cwnd,
+ tp->t_state,
+ nth->th_off,
+ ntohs(nth->th_sum),
+ ntohs(nth->th_urp));
+ }
if ((tp != NULL) && tcp_bblogging_on(tp)) {
if (INP_WLOCKED(inp)) {
union tcp_log_stackspecific log;
@@ -2258,6 +2570,12 @@
return (NULL);
}
tp->t_fb = listening_tcb->t_fb;
+ /*
+ * Inherit owning PID from the listening socket so that
+ * passively-accepted connections are attributed to the
+ * process that created the listener.
+ */
+ tp->t_owning_pid = listening_tcb->t_owning_pid;
} else {
tp->t_fb = V_tcp_func_set_ptr;
}
@@ -2362,9 +2680,24 @@
/* Initialize the per-TCPCB log data. */
tcp_log_tcpcbinit(tp);
#endif
+
+ /* Create per-connection eventlog session. Session ID is inp_gencnt. */
+ {
+ struct tcp_eventlog_session_create create_payload;
+
+ create_payload.tp = tp;
+ tp->t_eventlog_session = eventlog_session_create(
+ tp->t_fb->tfb_eventlog_provider,
+ inp->inp_gencnt,
+ false,
+ &create_payload,
+ sizeof(create_payload));
+ }
+
tp->t_pacing_rate = -1;
if (tp->t_fb->tfb_tcp_fb_init) {
if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
+ eventlog_session_destroy(tp->t_eventlog_session);
if (CC_ALGO(tp)->cb_destroy != NULL)
CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
CC_DATA(tp) = NULL;
@@ -2472,6 +2805,12 @@
tcp_log_tcpcbfini(tp);
#endif
+ /*
+ * Destroy per-connection eventlog session (emits SESSION_END).
+ */
+ eventlog_session_destroy(tp->t_eventlog_session);
+ tp->t_eventlog_session = NULL;
+
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
@@ -5016,6 +5355,13 @@
l->tlb_flex2 = option_val;
}
}
+ if (err != 0) {
+ TCP_EVENTLOG_SOCKET_OPT_ERR_LOG(tp->t_eventlog_session,
+ option_num, err);
+ } else {
+ TCP_EVENTLOG_SOCKET_OPT_UINT32_LOG(tp->t_eventlog_session,
+ option_num, option_val);
+ }
}
uint32_t
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -92,6 +92,7 @@
#ifdef TCP_BLACKBOX
#include <netinet/tcp_log_buf.h>
#endif
+#include <eventlog/tcp_eventlog.h>
#ifdef TCP_OFFLOAD
#include <netinet/toecore.h>
#endif
@@ -852,6 +853,14 @@
/* Override flowlabel from in6_pcbconnect. */
inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
inp->inp_flow |= sc->sc_flowlabel;
+
+ if (inp->inp_vflag & INP_IPV6) {
+ TCP_EVENTLOG_CONN_SET_IP_V6_LOG(tp->t_eventlog_session,
+ inp->inp_inc.inc6_laddr,
+ inp->inp_lport,
+ inp->inp_inc.inc6_faddr,
+ inp->inp_fport);
+ }
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -876,6 +885,14 @@
error = in_pcbconnect(inp, &sin, thread0.td_ucred);
if (error != 0)
goto abort;
+
+ if (inp->inp_vflag & INP_IPV4) {
+ TCP_EVENTLOG_CONN_SET_IP_V4_LOG(tp->t_eventlog_session,
+ inp->inp_laddr,
+ inp->inp_lport,
+ inp->inp_faddr,
+ inp->inp_fport);
+ }
}
#endif /* INET */
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
@@ -961,6 +978,9 @@
*/
tcp_mss(tp, sc->sc_peer_mss);
+ /* Log once all handshake parameters are set. */
+ tcp_eventlog_conn_params(tp);
+
/*
* If the SYN,ACK was retransmitted, indicate that CWND to be
* limited to one segment in cc_conn_init().
@@ -999,6 +1019,17 @@
tcp_log_state_change(tp, tcp_get_bblog_state(sototcpcb(lso)));
}
#endif
+ /*
+ * Inherit eventlog enable state from the listener so that connections
+ * accepted on an eventlog-enabled listener are themselves observed.
+ * Enable the connection's session and dump its current state so
+ * subscribers see a complete view from the start.
+ */
+ if ((sototcpcb(lso)->t_flags2 & TF2_EVENTLOG_ENABLED) != 0) {
+ tp->t_flags2 |= TF2_EVENTLOG_ENABLED;
+ eventlog_session_set_enabled(tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp, tcp_log_id_str(tp));
+ }
/*
* Copy and activate timers.
*/
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -65,6 +65,7 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
@@ -626,6 +627,13 @@
tcp_free_sackholes(tp);
}
TCPSTAT_INC(tcps_rexmttimeo);
+ TCP_EVENTLOG_RTO_LOG(tp->t_eventlog_session,
+ tp->t_rxtshift,
+ tp->t_rttlow,
+ tp->t_srtt >> TCP_RTT_SHIFT,
+ tp->t_maxseg,
+ tp->snd_max - tp->snd_una,
+ 0);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
@@ -822,6 +830,15 @@
return (rv);
}
+/* Map a tt_which onto the eventlog timer_reason enum. */
+static uint8_t const tcp_eventlog_timer_reason[TT_N] = {
+ [TT_REXMT] = TCP_EVENTLOG_TIMER_REASON_RXT,
+ [TT_PERSIST] = TCP_EVENTLOG_TIMER_REASON_PERSIST,
+ [TT_KEEP] = TCP_EVENTLOG_TIMER_REASON_KEEPALIVE,
+ [TT_2MSL] = TCP_EVENTLOG_TIMER_REASON_TWOMSL,
+ [TT_DELACK] = TCP_EVENTLOG_TIMER_REASON_DELAYACK,
+};
+
static void
tcp_bblog_timer(struct tcpcb *tp, tt_which which, tt_what what, uint32_t ticks)
{
@@ -886,8 +903,12 @@
tp->t_precisions[which] = 0;
tcp_bblog_timer(tp, which, TT_PROCESSING, 0);
+ TCP_EVENTLOG_TIMER_FIRED_LOG(tp->t_eventlog_session,
+ tcp_eventlog_timer_reason[which]);
if (tcp_timersw[which](tp)) {
tcp_bblog_timer(tp, which, TT_PROCESSED, 0);
+ TCP_EVENTLOG_TIMER_PROCESSED_LOG(tp->t_eventlog_session,
+ tcp_eventlog_timer_reason[which]);
if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
MPASS(tp->t_state > TCPS_CLOSED);
callout_reset_sbt_on(&tp->t_callout,
@@ -920,9 +941,14 @@
what = TT_STARTING;
callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
&tp->t_timers[which], &tp->t_precisions[which]);
+ TCP_EVENTLOG_TIMER_START_LOG(tp->t_eventlog_session,
+ (((uint64_t)delta * 1000000) + (hz - 1)) / hz,
+ tcp_eventlog_timer_reason[which]);
} else {
what = TT_STOPPING;
tp->t_timers[which] = SBT_MAX;
+ TCP_EVENTLOG_TIMER_CANCEL_LOG(tp->t_eventlog_session,
+ tcp_eventlog_timer_reason[which]);
}
tcp_bblog_timer(tp, which, what, delta);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -92,6 +92,7 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
@@ -177,6 +178,12 @@
goto out;
}
tp->t_state = TCPS_CLOSED;
+ /*
+ * Record the PID of the process that opened this socket so the
+ * connection's eventlog can identify the owning application.
+ */
+ if (td != NULL && td->td_proc != NULL)
+ tp->t_owning_pid = td->td_proc->p_pid;
tcp_bblog_pru(tp, PRU_ATTACH, error);
INP_WUNLOCK(inp);
TCPSTATES_INC(TCPS_CLOSED);
@@ -897,6 +904,7 @@
u_int8_t incflagsav;
u_char vflagsav;
bool restoreflags;
+ u_int appended = 0;
INP_WLOCK(inp);
if (__predict_false(tp->t_flags & TF_DISCONNECTED)) {
@@ -1023,7 +1031,9 @@
if (!(flags & PRUS_OOB)) {
if (tp->t_acktime == 0)
tp->t_acktime = ticks;
+ appended = sbavail(&so->so_snd);
sbappendstream(&so->so_snd, m, flags);
+ appended = sbavail(&so->so_snd) - appended;
m = NULL;
if (nam && tp->t_state < TCPS_SYN_SENT) {
KASSERT(tp->t_state == TCPS_CLOSED,
@@ -1109,7 +1119,9 @@
*/
if (tp->t_acktime == 0)
tp->t_acktime = ticks;
+ appended = sbavail(&so->so_snd);
sbappendstream_locked(&so->so_snd, m, flags);
+ appended = sbavail(&so->so_snd) - appended;
SOCK_SENDBUF_UNLOCK(so);
m = NULL;
if (nam && tp->t_state < TCPS_SYN_SENT) {
@@ -1162,6 +1174,9 @@
&inp->inp_socket->so_snd,
TCP_LOG_USERSEND, error,
0, NULL, false);
+ TCP_EVENTLOG_USER_SEND_LOG(tp->t_eventlog_session,
+ error, flags, tp->t_state,
+ sbused(&so->so_snd), sbavail(&so->so_snd), appended);
out:
/*
@@ -1428,6 +1443,14 @@
if (error != 0)
return (error);
+ if (inp->inp_vflag & INP_IPV4) {
+ TCP_EVENTLOG_CONN_SET_IP_V4_LOG(tp->t_eventlog_session,
+ inp->inp_laddr,
+ inp->inp_lport,
+ inp->inp_faddr,
+ inp->inp_fport);
+ }
+
/* set the hash on the connection */
rss_proto_software_hash_v4(inp->inp_faddr, inp->inp_laddr,
inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
@@ -1475,6 +1498,14 @@
if (error != 0)
return (error);
+ if (inp->inp_vflag & INP_IPV6) {
+ TCP_EVENTLOG_CONN_SET_IP_V6_LOG(tp->t_eventlog_session,
+ inp->inp_inc.inc6_laddr,
+ inp->inp_lport,
+ inp->inp_inc.inc6_faddr,
+ inp->inp_fport);
+ }
+
/* set the hash on the connection */
rss_proto_software_hash_v6(&inp->in6p_faddr,
&inp->in6p_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
@@ -1753,6 +1784,7 @@
*/
tp->t_fb = blk;
tp->t_fb_ptr = ptr;
+ tcp_ensure_eventlog_session_on_switch(tp, blk);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
@@ -1946,6 +1978,8 @@
memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var));
/* Now attach the new, which takes a reference */
cc_attach(tp, algo);
+ TCP_EVENTLOG_CC_ALGO_CHANGE_LOG(tp->t_eventlog_session,
+ algo->name);
/* Ok now are we where we have gotten past any conn_init? */
if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
/* Yep run the connection init for the new CC */
@@ -2148,6 +2182,8 @@
} else {
tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
}
+ TCP_EVENTLOG_MSS_LOG(tp->t_eventlog_session,
+ tp->t_maxseg);
} else
error = EINVAL;
goto unlock_and_done;
@@ -2200,6 +2236,24 @@
INP_WUNLOCK(inp);
break;
+ case TCP_EVENTLOG:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+ INP_WLOCK_RECHECK(inp);
+ if (optval)
+ tp->t_flags2 |= TF2_EVENTLOG_ENABLED;
+ else
+ tp->t_flags2 &= ~TF2_EVENTLOG_ENABLED;
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, optval);
+ if (optval)
+ tcp_eventlog_dump_session(tp,
+ tcp_log_id_str(tp));
+ goto unlock_and_done;
+
#ifdef KERN_TLS
case TCP_TXTLS_ENABLE:
INP_WUNLOCK(inp);
@@ -2547,6 +2601,16 @@
error = EINVAL;
break;
#endif
+ case TCP_EVENTLOG:
+ if (tp->t_eventlog_session != NULL)
+ optval = eventlog_session_is_enabled(
+ tp->t_eventlog_session);
+ else
+ optval = (tp->t_flags2 &
+ TF2_EVENTLOG_ENABLED) ? 1 : 0;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
#ifdef KERN_TLS
case TCP_TXTLS_MODE:
error = ktls_get_tx_mode(so, &optval);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -444,6 +444,9 @@
struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;
struct tcp_log_id_bucket *t_lib;
+ struct eventlog_session *t_eventlog_session;
+ /* Per-connection eventlog session */
+ pid_t t_owning_pid; /* PID of process that opened this conn */
const char *t_output_caller; /* Function that called tcp_output */
struct statsblob *t_stats; /* Per-connection stats */
/* Should these be a pointer to the arrays or an array? */
@@ -624,6 +627,7 @@
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
+ struct eventlog_provider *tfb_eventlog_provider;
};
/* Maximum number of names each TCP function block can be registered with. */
@@ -855,6 +859,7 @@
#define TF2_PROC_SACK_PROHIBIT 0x00100000 /* Due to small MSS size do not process sack's */
#define TF2_IPSEC_TSO 0x00200000 /* IPSEC + TSO supported */
#define TF2_NO_ISS_CHECK 0x00400000 /* Don't check SEG.ACK against ISS */
+#define TF2_EVENTLOG_ENABLED 0x02000000 /* User requested eventlog enabled */
/* t_flags2 description for use with printf(9) %b identifier. */
#define TF2_BITS "\20" \
@@ -869,7 +874,7 @@
"\21TF2_SUPPORTS_MBUFQ\22TF2_MBUF_QUEUE_READY" \
"\23TF2_DONT_SACK_QUEUE\24TF2_CANNOT_DO_ECN" \
"\25TF2_PROC_SACK_PROHIBIT\26TF2_IPSEC_TSO" \
- "\27TF2_NO_ISS_CHECK"
+ "\27TF2_NO_ISS_CHECK\32TF2_EVENTLOG_ENABLED"
/*
* Structure to hold TCP options that are only used during segment
@@ -1276,6 +1281,7 @@
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_tcp);
SYSCTL_DECL(_net_inet_tcp_sack);
+SYSCTL_DECL(_net_inet_tcp_eventlog);
MALLOC_DECLARE(M_TCPLOG);
#endif
@@ -1441,6 +1447,17 @@
int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs);
uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity);
void tcp_switch_back_to_default(struct tcpcb *tp);
+void tcp_ensure_eventlog_session_on_switch(struct tcpcb *tp,
+ struct tcp_function_block *tfb);
+void tcp_eventlog_dump_state(struct eventlog_provider *provider, void *arg);
+void tcp_eventlog_default_changed(struct eventlog_provider *provider,
+ int value, void *arg);
+void tcp_eventlog_dump_session(struct tcpcb *tp, const char *log_id);
+void tcp_eventlog_on_log_id_set(struct tcpcb *tp, const char *log_id);
+void tcp_eventlog_on_log_id_clear(struct tcpcb *tp);
+void tcp_eventlog_sendfile(struct socket *so, off_t offset, size_t nbytes,
+ int flags);
+void tcp_eventlog_conn_params(struct tcpcb *tp);
struct tcp_function_block *
find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt);
diff --git a/tests/sys/kern/elog_test.py b/tests/sys/kern/elog_test.py
--- a/tests/sys/kern/elog_test.py
+++ b/tests/sys/kern/elog_test.py
@@ -4,27 +4,430 @@
# SPDX-License-Identifier: BSD-2-Clause
#
-"""ATF tests for the elog(1) userspace utility.
+"""End-to-end ATF tests for the elog(1) userspace utility.
-Smoke tests for the elog binary CLI surface. These catch packaging
-regressions (missing binary, broken option parser, broken capture-file
-reader) without requiring /dev/eventlog or any provider to be present.
+Validates the full path: elog binary -> /dev/eventlog -> kernel eventlog
+framework. Complements kern_eventlog_test (which exercises kernel-internal
+APIs via a ktest module) by covering the real character-device + IOCTL
+surface.
-End-to-end coverage of the device interface and individual providers
-lives with the providers themselves; the framework's own kernel-side
-tests are in kern_eventlog_test.py.
+Most cases require root because /dev/eventlog rejects jailed callers
+(kern_eventlog.c eventlog_dev_open) and the CREATE IOCTL needs to allocate
+per-CPU buffers.
"""
+import os
+import socket
+import struct
import subprocess
+import time
+from collections import namedtuple
+from contextlib import contextmanager
from pathlib import Path
import pytest
from atf_python.utils import BaseTest
ELOG = "/usr/bin/elog"
+EVENTLOG_DEV = "/dev/eventlog"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _sysctl_get(name):
+ """Return the current value of sysctl `name` as a string, or None if
+ the OID does not exist."""
+ r = subprocess.run(
+ ["sysctl", "-n", name],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if r.returncode != 0:
+ return None
+ return r.stdout.decode().strip()
+
+
+def _sysctl_set(name, value):
+ """Set sysctl `name` to `value`. Raises CalledProcessError on failure."""
+ subprocess.run(
+ ["sysctl", f"{name}={value}"],
+ stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
+ check=True)
+
+
+@contextmanager
+def sysctl_override(name, value):
+ """Save current value of sysctl `name`, set it to `value`, then
+ restore on exit. Skips the test if the OID is missing or the new
+ value is rejected."""
+ old = _sysctl_get(name)
+ if old is None:
+ pytest.skip(f"sysctl {name} not present")
+ try:
+ _sysctl_set(name, value)
+ except subprocess.CalledProcessError as e:
+ pytest.skip(f"cannot set {name}={value}: "
+ f"{e.stderr.decode().strip()}")
+ try:
+ yield old
+ finally:
+ # Restore unconditionally; ignore errors so cleanup can't mask
+ # the original assertion failure.
+ subprocess.run(
+ ["sysctl", f"{name}={old}"],
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False)
+
+
+def _require_eventlog_dev():
+ if not Path(EVENTLOG_DEV).is_char_device():
+ pytest.skip(f"{EVENTLOG_DEV} not present")
+
+
+def _pick_port(salt=0):
+ """Pseudo-random high port; deterministic across reruns of the same
+ pytest invocation but varies with the test process."""
+ return 20000 + ((os.getpid() + salt) % 30000)
+
+
+def _addr_for(family):
+ return "::1" if family == socket.AF_INET6 else "127.0.0.1"
+
+
+@contextmanager
+def loopback_pair(family, port, linger0=False):
+ """Open a listener+connected client pair on `(addr, port)` over loopback,
+ yield (server-accepted-conn, client) sockets, and close all three on
+ exit. Returns once the three-way handshake has completed (accept()
+ returns).
+
+ If linger0=True, the client is configured with SO_LINGER {1, 0} so
+ that close() emits a TCP RST and the PCB is reclaimed immediately
+ instead of going through TIME_WAIT.
+ """
+ addr = _addr_for(family)
+ srv = socket.socket(family, socket.SOCK_STREAM)
+ srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ cli = socket.socket(family, socket.SOCK_STREAM)
+ if linger0:
+ # struct linger { int l_onoff; int l_linger; }
+ cli.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
+ struct.pack("ii", 1, 0))
+ conn = None
+ try:
+ srv.bind((addr, port))
+ srv.listen(1)
+ cli.connect((addr, port))
+ conn, _ = srv.accept()
+ yield conn, cli
+ finally:
+ for s in (conn, cli, srv):
+ if s is not None:
+ try:
+ s.close()
+ except OSError:
+ pass
+
+
+def loopback_oneshot(family, port, payload):
+ """Drive a one-shot TCP loopback exchange and return what the server
+ received. Closes both ends before returning."""
+ addr = _addr_for(family)
+ with socket.socket(family, socket.SOCK_STREAM) as srv:
+ srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ srv.bind((addr, port))
+ srv.listen(1)
+ with socket.socket(family, socket.SOCK_STREAM) as cli:
+ cli.connect((addr, port))
+ conn, _ = srv.accept()
+ try:
+ cli.sendall(payload)
+ cli.shutdown(socket.SHUT_WR)
+ data = b""
+ while True:
+ chunk = conn.recv(4096)
+ if not chunk:
+ break
+ data += chunk
+ return data
+ finally:
+ conn.close()
+
+
+@contextmanager
+def elog_capture(*args, capture_path=None):
+ """Start `elog -s [args] [-o capture_path]` in the background. The
+ caller supplies all `-c <provider>` subscriptions and any other
+ flags. If `capture_path` is None no `-o` is passed (live text mode
+ on stdout). Yields the Popen handle. Sends SIGTERM and reaps on
+ exit; asserts a clean (rc=0) shutdown.
+ """
+ cmd = [ELOG, "-s", *args]
+ if capture_path is not None:
+ cmd += ["-o", str(capture_path)]
+ proc = subprocess.Popen(
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ # Give elog a tick to issue CREATE/GET_PROVIDERS before the caller
+ # drives traffic.
+ time.sleep(1)
+ if proc.poll() is not None:
+ err = proc.stderr.read().decode()
+ raise AssertionError(f"elog exited early: {err}")
+ try:
+ yield proc
+ finally:
+ if proc.poll() is None:
+ proc.terminate()
+ try:
+ _, err = proc.communicate(timeout=10)
+ except subprocess.TimeoutExpired:
+ proc.kill()
+ _, err = proc.communicate()
+ if proc.returncode != 0:
+ raise AssertionError(
+ f"elog exited rc={proc.returncode}; "
+ f"stderr={err.decode()}")
+
+
+def decode_capture(path):
+ """Run `elog -e -r <path>` and return its stdout."""
+ return subprocess.check_output(
+ [ELOG, "-e", "-r", str(path)], text=True)
+
+
+def decode_capture_or_empty(path):
+ """Like decode_capture, but if the binary capture file is 0 bytes
+ (no events were ever recorded, so elog never wrote the header)
+ return "". elog(1) rejects an empty file with `bad magic number`,
+ which is correct behaviour for replay but is not a test failure
+ when we deliberately set up a filter that drops every event.
+ """
+ if Path(path).stat().st_size == 0:
+ return ""
+ return decode_capture(path)
+
+
+def _assert_in_decode(decoded, needle, label, *, head_chars=2000):
+ """Assert that `needle` is a substring of `decoded`. On failure include
+ the head of the decoded text so the failure is self-debugging in the
+ kyua report."""
+ if needle in decoded:
+ return
+ pytest.fail(
+ f"{label}: missing literal {needle!r} in decoded capture\n"
+ f"--- decoded.txt (first {head_chars} chars) ---\n"
+ f"{decoded[:head_chars]}\n--- end ---")
+
+
+# ---------------------------------------------------------------------------
+# elog binary-format decoder (in-process, no fork to elog -r)
+# ---------------------------------------------------------------------------
+#
+# On-disk layout (host byte order; we use "<" for amd64):
+#
+# elog_binary_header (40 bytes, __packed):
+# char magic[4] == "ELOG"
+# uint32_t version == 1
+# uint64_t capture_start microseconds since boot at capture start
+# uint64_t start_utc_us wall-clock UTC microseconds at capture start
+# uint64_t event_count total events written
+# uint64_t dropped_events events dropped (kernel buffer overflow)
+#
+# uint32_t provider_count
+# provider_count x eventlog_provider_info (34 bytes, __packed):
+# uint16_t provider_id
+# char name[EVENTLOG_PROVIDER_NAME_MAX = 32]
+#
+# stream of:
+# eventlog_event_header (32 bytes, naturally aligned, no padding):
+# uint16_t event_length total record size including header
+# uint16_t cpu
+# uint16_t provider_id
+# uint16_t RESERVED
+# uint64_t timestamp microseconds since boot
+# uint64_t session_id
+# uint32_t event_id
+# int32_t thread_id
+# payload event_length - 32 bytes
+#
+# References:
+# FreeBSD/usr.bin/elog/elog.c elog_binary_header / write_binary_event
+# FreeBSD/sys/sys/eventlog_subscriber.h
+# eventlog_event_header / eventlog_provider_info
+# FreeBSD/include/eventlog/tcp_eventlog_schema.src
+# STRUCT layouts for payload parsing
+
+ELOG_BINARY_MAGIC = b"ELOG"
+ELOG_BINARY_VERSION = 1
+EVENTLOG_PROVIDER_NAME_MAX = 32
+
+# Framework-reserved event IDs (FreeBSD/sys/sys/eventlog.h).
+EVENTLOG_SESSION_CREATE_ID = 0xFFFFFFFE
+EVENTLOG_SESSION_END_ID = 0xFFFFFFFF
+
+# tcp provider event IDs (tcp_eventlog_schema.src).
+TCP_EVENT_CONN_SET_IP_V4 = 0
+TCP_EVENT_CONN_SET_IP_V6 = 1
+TCP_EVENT_CONN_PARAMS = 3
+TCP_EVENT_MSS = 4
+TCP_EVENT_IN = 100
+TCP_EVENT_OUT = 101
+
+ELOG_BINARY_HEADER_FMT = "<4sIQQQQ"
+ELOG_BINARY_HEADER_SIZE = struct.calcsize(ELOG_BINARY_HEADER_FMT) # 40
+ELOG_PROVIDER_INFO_FMT = f"<H{EVENTLOG_PROVIDER_NAME_MAX}s"
+ELOG_PROVIDER_INFO_SIZE = struct.calcsize(ELOG_PROVIDER_INFO_FMT) # 34
+ELOG_EVENT_HEADER_FMT = "<HHHHQQIi"
+ELOG_EVENT_HEADER_SIZE = struct.calcsize(ELOG_EVENT_HEADER_FMT) # 32
+assert ELOG_BINARY_HEADER_SIZE == 40
+assert ELOG_PROVIDER_INFO_SIZE == 34
+assert ELOG_EVENT_HEADER_SIZE == 32
+
+ElogFileHeader = namedtuple(
+ "ElogFileHeader",
+ "capture_start start_utc_us event_count dropped_events")
+ElogEventHeader = namedtuple(
+ "ElogEventHeader",
+ "event_length cpu provider_id timestamp session_id event_id thread_id")
+
+
+def parse_elog_binary(path):
+ """Parse a complete elog binary capture file. Returns a tuple
+ `(file_header, providers_by_id, events)` where:
+ file_header ElogFileHeader
+ providers_by_id dict[int -> str], e.g. {1: "tcp"}
+ events list of (ElogEventHeader, payload_bytes)
+
+ Raises AssertionError on any structural problem (bad magic,
+ truncated event, payload-size mismatch with the recorded
+ event_length, etc.).
+ """
+ data = Path(path).read_bytes()
+ off = 0
+
+ if len(data) < ELOG_BINARY_HEADER_SIZE:
+ raise AssertionError(
+ f"capture {path!s}: only {len(data)} bytes, "
+ f"need ≥{ELOG_BINARY_HEADER_SIZE} for the file header")
+ (magic, version, capture_start, start_utc_us, event_count,
+ dropped_events) = struct.unpack_from(
+ ELOG_BINARY_HEADER_FMT, data, off)
+ off += ELOG_BINARY_HEADER_SIZE
+ if magic != ELOG_BINARY_MAGIC:
+ raise AssertionError(
+ f"capture {path!s}: bad magic {magic!r} "
+ f"(expected {ELOG_BINARY_MAGIC!r})")
+ if version != ELOG_BINARY_VERSION:
+ raise AssertionError(
+ f"capture {path!s}: bad version {version} "
+ f"(expected {ELOG_BINARY_VERSION})")
+ file_hdr = ElogFileHeader(
+ capture_start=capture_start, start_utc_us=start_utc_us,
+ event_count=event_count, dropped_events=dropped_events)
+
+ if off + 4 > len(data):
+ raise AssertionError(
+ f"capture {path!s}: truncated before provider_count")
+ (prov_count,) = struct.unpack_from("<I", data, off)
+ off += 4
+ # The kernel's EVENTLOG_MAX_PROVIDERS is 32; a value above that
+ # signals a corrupt header.
+ if prov_count > 32:
+ raise AssertionError(
+ f"capture {path!s}: implausible provider_count={prov_count}")
+ providers = {}
+ for i in range(prov_count):
+ if off + ELOG_PROVIDER_INFO_SIZE > len(data):
+ raise AssertionError(
+ f"capture {path!s}: truncated provider entry {i}")
+ prov_id, name_bytes = struct.unpack_from(
+ ELOG_PROVIDER_INFO_FMT, data, off)
+ off += ELOG_PROVIDER_INFO_SIZE
+ name = name_bytes.split(b"\0", 1)[0].decode("ascii")
+ providers[prov_id] = name
+
+ events = []
+ while off < len(data):
+ if off + ELOG_EVENT_HEADER_SIZE > len(data):
+ raise AssertionError(
+ f"capture {path!s}: trailing partial event header "
+ f"at off={off} (remaining={len(data) - off})")
+ (event_length, cpu, provider_id, _reserved, timestamp,
+ session_id, event_id, thread_id) = struct.unpack_from(
+ ELOG_EVENT_HEADER_FMT, data, off)
+ if event_length < ELOG_EVENT_HEADER_SIZE:
+ raise AssertionError(
+ f"capture {path!s}: event_length {event_length} "
+ f"< header size {ELOG_EVENT_HEADER_SIZE} at off={off}")
+ if off + event_length > len(data):
+ raise AssertionError(
+ f"capture {path!s}: event at off={off} runs past EOF "
+ f"(event_length={event_length}, "
+ f"remaining={len(data) - off})")
+ payload = bytes(data[off + ELOG_EVENT_HEADER_SIZE:
+ off + event_length])
+ events.append((
+ ElogEventHeader(
+ event_length=event_length, cpu=cpu,
+ provider_id=provider_id, timestamp=timestamp,
+ session_id=session_id, event_id=event_id,
+ thread_id=thread_id),
+ payload,
+ ))
+ off += event_length
+
+ return file_hdr, providers, events
+
+
+def parse_session_create_payload(payload):
+ """SESSION_CREATE payload (from STRUCT SESSION_CREATE in
+ tcp_eventlog_schema.src):
+ void *tp; // 8 bytes on amd64, naturally aligned
+ """
+ if len(payload) != 8:
+ raise AssertionError(
+ f"SESSION_CREATE payload size {len(payload)} != 8")
+ (tp,) = struct.unpack("<Q", payload)
+ return {"tp": tp}
+
+
+def parse_conn_set_ip_v4_payload(payload):
+ """CONN_SET_IP_V4 payload (from STRUCT CONN_SET_IP_V4 in
+ tcp_eventlog_schema.src), with C natural alignment:
+ struct in_addr src_addr; // 4 bytes (network byte order)
+ uint16_t src_port; // 2 bytes (network byte order)
+ // 2 bytes padding so the next in_addr is 4-byte aligned
+ struct in_addr dst_addr; // 4 bytes
+ uint16_t dst_port; // 2 bytes
+ // 2 bytes trailing padding so sizeof rounds up to 16
+ """
+ if len(payload) != 16:
+ raise AssertionError(
+ f"CONN_SET_IP_V4 payload size {len(payload)} != 16")
+ src_addr_bytes = payload[0:4]
+ (src_port_n,) = struct.unpack_from("<H", payload, 4)
+ dst_addr_bytes = payload[8:12]
+ (dst_port_n,) = struct.unpack_from("<H", payload, 12)
+ return {
+ # in_addr stores the address in network byte order; the raw
+ # 4 bytes are exactly what inet_ntoa expects.
+ "src_addr": socket.inet_ntoa(src_addr_bytes),
+ "dst_addr": socket.inet_ntoa(dst_addr_bytes),
+ "src_port": socket.ntohs(src_port_n),
+ "dst_port": socket.ntohs(dst_port_n),
+ }
+
+
+# ---------------------------------------------------------------------------
+# CLI smoke tests (no kernel touch)
+# ---------------------------------------------------------------------------
class TestElogCli(BaseTest):
+ """Cheap smoke tests for the elog binary CLI surface. These catch
+ packaging regressions (missing binary, broken option parser) without
+ needing /dev/eventlog or the kernel framework to be functional."""
+
@pytest.mark.require_progs(["elog"])
def test_help(self):
# usage() in elog.c calls exit(1), so -h is expected to fail.
@@ -65,3 +468,676 @@
[ELOG, "-r", str(bogus)], capture_output=True, text=True)
assert r.returncode == 1
assert "bad magic number" in r.stderr
+
+
+# ---------------------------------------------------------------------------
+# Tests that touch /dev/eventlog and the tcp eventlog provider
+# ---------------------------------------------------------------------------
+
+
+class TestElogCapture(BaseTest):
+ """End-to-end coverage: elog -> /dev/eventlog -> kernel framework.
+
+ Most of these pin net.inet.tcp.functions_default to "freebsd" for
+ the duration of the test. Eventlog wiring is per-stack
+ (tfb_eventlog_provider on the function-block); pinning to a stack
+ that is known to register the tcp eventlog provider keeps the
+ tests deterministic regardless of which stack the host happens to
+ have configured by default. Cases that need a different stack
+ skip cleanly if that stack is not available on the running image
+ (sysctl_override -> pytest.skip).
+ """
+
+ NEED_ROOT = True
+
+ @pytest.mark.require_user("root")
+ def test_dev_eventlog_present(self):
+ # Sanity: /dev/eventlog should exist as a character device on any
+ # kernel with the eventlog framework compiled in.
+ assert Path(EVENTLOG_DEV).is_char_device(), (
+ f"{EVENTLOG_DEV} not a character device")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(30)
+ def test_capture_tcp_brief(self, tmp_path):
+ """elog -c tcp subscribes, captures briefly with no traffic, and
+ exits cleanly on SIGTERM."""
+ _require_eventlog_dev()
+ capture = tmp_path / "capture.elog"
+ # No traffic; the SIGTERM handler in elog.c sets done=true and
+ # the read loop returns 0 from main().
+ with elog_capture("-c", "tcp", capture_path=capture) as elog:
+ time.sleep(1)
+ # Stats output goes to stderr in a "[Stats]\n Providers: N" block.
+ # A successful subscribe always discovers at least one provider.
+ # (The elog_capture context manager already asserted rc==0.)
+ # We don't validate stats stderr here; the capture context manager
+ # would have raised if elog died.
+ assert elog.returncode == 0
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ @pytest.mark.parametrize(
+ "stack,family",
+ [
+ pytest.param("freebsd", socket.AF_INET, id="freebsd_inet"),
+ pytest.param("freebsd", socket.AF_INET6, id="freebsd_inet6"),
+ ],
+ )
+ def test_capture_tcp_loopback(self, tmp_path, stack, family):
+ """Drive a loopback TCP exchange under the given stack and family,
+ then verify the captured eventlog contains the events we expect."""
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port()
+ payload = b"ping over loopback\n"
+
+ with sysctl_override("net.inet.tcp.functions_default", stack), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ sessions_created_before = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_created"))
+
+ with elog_capture("-c", "tcp", capture_path=capture):
+ # One-shot loopback exchange. Python's socket module
+ # makes the handshake/teardown deterministic - no nc
+ # process to wait for, no sockstat polling.
+ received = loopback_oneshot(family, port, payload)
+ assert payload.rstrip(b"\n") in received
+
+ # Let the kernel push remaining events and elog drain
+ # its read (the read path's tsleep also has a 1Hz timer).
+ time.sleep(2)
+
+ sessions_created_after = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_created"))
+
+ # If no new sessions were created, downstream content assertions
+ # are meaningless.
+ assert sessions_created_after > sessions_created_before, (
+ f"no new tcp sessions during exchange "
+ f"(stack={stack}, family={family.name})")
+
+ # --- Decode and validate ---
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ if family == socket.AF_INET6:
+ ipver_label = "IPv6 connection"
+ loop_addr = "::1"
+ else:
+ ipver_label = "IPv4 connection"
+ loop_addr = "127.0.0.1"
+
+ # Each tuple is (substring, human label). On failure the helper
+ # dumps the head of the decoded capture.
+ expectations = [
+ ("[tcp]", "tcp provider tag"),
+ ("[SESSION_CREATE]", "SESSION_CREATE event"),
+ ("[SESSION_END]", "SESSION_END event"),
+ (ipver_label, f"{ipver_label} literal"),
+ (loop_addr, f"loopback addr {loop_addr}"),
+ (str(port), f"chosen port {port}"),
+ ("Connection parameters: ISS", "CONN_PARAMS event"),
+ ("MSS set to", "MSS event"),
+ ("[IN]", "[IN] data event"),
+ ("[OUT]", "[OUT] data event"),
+ ("flags SYN", "SYN-bearing segment"),
+ ("in state ESTABLISHED", "ESTABLISHED-state IN event"),
+ ]
+ for needle, label in expectations:
+ _assert_in_decode(decoded, needle, label)
+
+ # Binary file format integrity: decoding twice must produce
+ # byte-identical output. Catches non-deterministic decode bugs
+ # (stale reader state, miscomputed offsets, etc.).
+ decoded2 = decode_capture(capture)
+ assert decoded == decoded2, "elog -r is not deterministic"
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_dump_state(self, tmp_path):
+ """Validate the elog -D (DUMP_STATE) flow.
+
+ Open a tcp connection BEFORE subscribing, then run elog with -D.
+ The CREATE IOCTL invokes tcp_eventlog_dump_state inline on this
+ thread; the callback walks every TCP PCB and re-emits
+ SESSION_CREATE / CONN_SET_IP_V4 / CONN_PARAMS for each
+ enabled session. This exercises a meaningfully different code
+ path from test_capture_tcp_loopback (where the subscriber
+ attaches first and events flow live).
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=1)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ # Stand up a long-lived loopback connection FIRST so its
+ # session pre-exists when elog subscribes.
+ with loopback_pair(socket.AF_INET, port):
+ # Subscribe with -D. The dump callback fires inline.
+ with elog_capture("-c", "tcp", "-D",
+ capture_path=capture):
+ # Give elog time to drain the dumped events.
+ time.sleep(2)
+
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ for needle, label in (
+ ("[SESSION_CREATE]", "SESSION_CREATE in dump"),
+ ("IPv4 connection", "CONN_SET_IP_V4 in dump"),
+ ("127.0.0.1", "loopback addr in dump"),
+ (str(port), f"port {port} in dump"),
+ ("Connection parameters: ISS", "CONN_PARAMS in dump"),
+ (", MSS ", "MSS field of CONN_PARAMS in dump"),
+ ):
+ _assert_in_decode(decoded, needle, label)
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_mass_enable_disable(self, tmp_path):
+ """Validate kern.eventlog.tcp.default=2 / -1 transitions.
+
+ Writing 2 means "enable all currently-disabled sessions"; -1
+ means "disable all currently-active sessions". Plain 0/1
+ transitions only affect future sessions and are covered by
+ test_capture_tcp_loopback. The 2/-1 path runs through
+ tcp_eventlog_default_changed and eventlog_provider_set_all_sessions
+ and otherwise has no end-to-end coverage.
+
+ Strategy: open a long-lived nc-equivalent pair under default=0
+ (sessions come up disabled), snapshot sessions_enabled, flip
+ default=2 and assert it rose, drive a side connection so we can
+ confirm events actually flow under default=2, flip default=-1
+ and assert sessions_enabled fell. Counter deltas are robust
+ against background tcp activity on the test image.
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=2)
+ side_port = _pick_port(salt=3)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "0"):
+
+ with elog_capture("-c", "tcp", capture_path=capture):
+ # Long-lived holding pair. Sessions come up disabled
+ # because default=0 at the moment they're created.
+ with loopback_pair(socket.AF_INET, port):
+ enabled_before = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_enabled"))
+
+ # === Phase 1: mass enable ===
+ _sysctl_set("kern.eventlog.tcp.default", "2")
+ enabled_after_enable = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_enabled"))
+ assert enabled_after_enable > enabled_before, (
+ f"default=2 did not raise sessions_enabled: "
+ f"{enabled_before} -> {enabled_after_enable}")
+
+ # Drive a side exchange while logging is on so the
+ # capture has observable events for that connection.
+ loopback_oneshot(socket.AF_INET, side_port,
+ b"mass-enable phase\n")
+ time.sleep(1)
+
+ # === Phase 2: mass disable ===
+ _sysctl_set("kern.eventlog.tcp.default", "-1")
+ enabled_after_disable = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_enabled"))
+ assert enabled_after_disable < enabled_after_enable, (
+ f"default=-1 did not lower sessions_enabled: "
+ f"{enabled_after_enable} -> {enabled_after_disable}")
+
+ # Brief drain before the holding pair tears down.
+ time.sleep(1)
+
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ # The phase-1 side connection was opened *while* default=2 was in
+ # effect, so its sessions started enabled and emitted normal
+ # events. We expect SESSION_CREATE plus IN/OUT for that port.
+ for needle, label in (
+ (str(side_port), f"side connection port {side_port}"),
+ ("[SESSION_CREATE]", "SESSION_CREATE in capture"),
+ ("IPv4 connection", "CONN_SET_IP_V4 in capture"),
+ ("[IN]", "[IN] events in capture"),
+ ("[OUT]", "[OUT] events in capture"),
+ ):
+ _assert_in_decode(decoded, needle, label)
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ @pytest.mark.parametrize(
+ "keyword,expected_present,expected_absent",
+ [
+ pytest.param("RX", "[IN]", "[OUT]", id="RX_only"),
+ pytest.param("TX", "[OUT]", "[IN]", id="TX_only"),
+ ],
+ )
+ def test_capture_tcp_keyword_filter(
+ self, tmp_path, keyword, expected_present, expected_absent):
+ """Validate `-c tcp <level> <keyword>` keyword filtering.
+
+ IN events are tagged with the RX keyword and OUT events with
+ TX (see tcp_eventlog_schema.src). A subscription that
+ specifies only RX must produce IN events but no OUT events,
+ and vice versa. SESSION_CREATE/SESSION_END are tagged with
+ the framework-reserved SESSION keyword which elog(1)
+ unconditionally OR's into every subscription, so those still
+ appear regardless of the user-specified keyword mask.
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=20)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+ with elog_capture("-c", "tcp", "VERBOSE", keyword,
+ capture_path=capture):
+ loopback_oneshot(socket.AF_INET, port,
+ b"keyword-filter\n")
+ time.sleep(2)
+
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ _assert_in_decode(
+ decoded, expected_present,
+ f"keyword={keyword}: {expected_present} should be present")
+ if expected_absent in decoded:
+ pytest.fail(
+ f"keyword={keyword}: {expected_absent!r} should be "
+ f"filtered out but is present in capture\n"
+ f"--- decoded.txt (first 2000 chars) ---\n"
+ f"{decoded[:2000]}\n--- end ---")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_level_filter(self, tmp_path):
+ """Validate `-c tcp <level>` level filtering.
+
+ Subscribing at ERROR (level=1) requests only events at level
+ ERROR or below. All TCP events on a clean exchange are at
+ INFO/VERBOSE/TRACE level (see tcp_eventlog_schema.src), so the
+ capture must contain none of the normal lifecycle events.
+ Compared to a TRACE subscription over the same workload,
+ which produces dozens of events.
+
+ Note: when the level filter drops every event the capture
+ file is 0 bytes (elog only writes the binary header lazily,
+ on the first event). decode_capture_or_empty handles that.
+ """
+ _require_eventlog_dev()
+
+ port_low = _pick_port(salt=21)
+ port_high = _pick_port(salt=22)
+
+ capture_low = tmp_path / "capture-error.elog"
+ capture_high = tmp_path / "capture-trace.elog"
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ # Pass A: ERROR level - clean traffic should produce no
+ # tcp lifecycle/data events at this filter level.
+ with elog_capture("-c", "tcp", "ERROR",
+ capture_path=capture_low):
+ loopback_oneshot(socket.AF_INET, port_low,
+ b"level-error\n")
+ time.sleep(2)
+
+ # Pass B: TRACE level - same traffic, full event stream.
+ with elog_capture("-c", "tcp", "TRACE",
+ capture_path=capture_high):
+ loopback_oneshot(socket.AF_INET, port_high,
+ b"level-trace\n")
+ time.sleep(2)
+
+ decoded_low = decode_capture_or_empty(capture_low)
+ decoded_high = decode_capture(capture_high)
+
+ # ERROR pass must not contain any of the normal INFO/VERBOSE
+ # events. We test for [IN] and [OUT] (VERBOSE) plus
+ # CONN_SET_IP_V4's literal output (INFO). All three are absent
+ # at ERROR level. (Empty capture trivially satisfies this.)
+ for needle in ("[IN]", "[OUT]", "IPv4 connection"):
+ if needle in decoded_low:
+ pytest.fail(
+ f"level=ERROR capture should not contain "
+ f"{needle!r} but does\n"
+ f"--- decoded-error.txt (first 2000 chars) ---\n"
+ f"{decoded_low[:2000]}\n--- end ---")
+
+ # TRACE pass must contain the full event stream for the same
+ # workload. This proves the level filter actually filters
+ # (rather than the test's traffic being silently broken).
+ for needle, label in (
+ ("[IN]", "[IN] in TRACE capture"),
+ ("[OUT]", "[OUT] in TRACE capture"),
+ ("IPv4 connection", "CONN_SET_IP_V4 in TRACE capture"),
+ ):
+ _assert_in_decode(decoded_high, needle, label)
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_per_session_output_dir(self, tmp_path):
+ """Validate `-o dir=PATH -f tcp` per-session output mode.
+
+ elog supports splitting captured events into one binary file
+ per session, named by TCP four-tuple when -f tcp is set:
+ <log_id>_<local_port>_<remote_ip>_<remote_port>.elog
+ This is the operational mode used to debug a single
+ connection in the field.
+
+ Drive two distinct loopback connections, then assert:
+ 1) the output dir is created
+ 2) at least one per-session file is created for each
+ connection (matching the chosen port)
+ 3) every per-session file decodes successfully and contains
+ a SESSION_CREATE event for its session
+ """
+ _require_eventlog_dev()
+
+ out_dir = tmp_path / "split"
+ port_a = _pick_port(salt=30)
+ port_b = _pick_port(salt=31)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+ with elog_capture("-c", "tcp", "-f", "tcp",
+ "-o", f"dir={out_dir}"):
+ # Two distinct connections so we can assert both
+ # ports appear in the per-session filenames.
+ for port in (port_a, port_b):
+ loopback_oneshot(socket.AF_INET, port,
+ b"per-session\n")
+ time.sleep(2)
+
+ assert out_dir.is_dir(), \
+ f"elog did not create output dir {out_dir}"
+
+ files = sorted(out_dir.glob("*.elog"))
+ assert files, f"no per-session files under {out_dir}"
+
+ names = [f.name for f in files]
+ for port in (port_a, port_b):
+ matches = [n for n in names if str(port) in n]
+ assert matches, (
+ f"no per-session file mentions port {port}; "
+ f"got: {names}")
+
+ # Every per-session file must be a valid binary capture and
+ # contain a SESSION_CREATE event for the session it represents.
+ for path in files:
+ decoded = decode_capture(path)
+ _assert_in_decode(
+ decoded, "[SESSION_CREATE]",
+ f"{path.name}: SESSION_CREATE in per-session file")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_sessions_active_no_leak(self):
+ """Open and close N TCP loopback pairs; sessions_active must
+ return to its baseline. Catches PCB-side eventlog session
+ lifecycle leaks (a destroyed PCB whose session record never
+ gets freed would inflate sessions_active monotonically).
+
+ We use SO_LINGER {1, 0} on the client so close() emits a TCP
+ RST and the kernel reclaims both PCBs immediately, avoiding
+ TIME_WAIT and keeping the test runtime bounded.
+ """
+ _require_eventlog_dev()
+
+ N = 10
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ # Settle for a moment then sample baseline. The test image
+ # has sshd, possibly other tcp sessions; we tolerate small
+ # drift.
+ time.sleep(1)
+ baseline = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_active"))
+
+ # Each iteration creates exactly two enabled tcp sessions
+ # (one per PCB) and immediately destroys both via RST.
+ for i in range(N):
+ with loopback_pair(socket.AF_INET,
+ _pick_port(salt=40 + i),
+ linger0=True):
+ pass
+
+ # Give the kernel a moment to reclaim PCBs and drain any
+ # SESSION_END processing.
+ time.sleep(2)
+
+ final = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_active"))
+
+ delta = final - baseline
+ # A real leak shows up as delta >= 2*N (one session per PCB,
+ # two PCBs per pair). Tolerate small ambient growth (e.g.,
+ # the test runner's own ssh shell churning) up to N - this
+ # is well below the 2*N a leak would produce.
+ assert delta < N, (
+ f"tcp eventlog sessions_active leak: "
+ f"baseline={baseline} -> final={final} "
+ f"(delta={delta}, opened={N} pairs)")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_binary_format(self, tmp_path):
+ """Validate the on-disk binary capture format end-to-end by
+ decoding it directly in Python (no `elog -r` round-trip).
+
+ `elog -r` is forgiving about its own output: it could trade
+ layouts back and forth without anyone noticing. Real
+ consumers of the binary stream - external collectors, custom
+ analyzers, off-host tooling - aren't, and any change to the
+ record layout, header struct, or framework-emitted events is
+ an ABI change for them. Parsing the file with a hand-written
+ Python decoder catches those changes immediately.
+
+ Asserts on the binary file directly:
+ - elog_binary_header magic / version / start_utc_us / counts
+ are sane and self-consistent (event_count matches the
+ number of records actually walked, dropped_events=0).
+ - Provider map contains the tcp provider id.
+ - Each event header has provider_id matching the tcp
+ provider, event_length is plausible (>= header,
+ <= a generous max), and the recorded payload size
+ agrees with event_length - sizeof(header).
+ - Per-session timestamps are monotonically non-decreasing
+ within the same session_id.
+ - Loopback creates two PCBs (client + server) so we expect
+ >= 2 distinct session_ids that each transit
+ SESSION_CREATE -> ... -> SESSION_END, with at least one
+ CONN_SET_IP_V4, MSS, and one of IN/OUT in between.
+ - SESSION_CREATE payload is exactly 8 bytes (void *tp)
+ and tp is non-zero (real PCB pointer from the kernel).
+ - CONN_SET_IP_V4 payload is exactly 16 bytes and decodes
+ to (127.0.0.1, port, 127.0.0.1, port) for the loopback
+ four-tuple, with our chosen port present in at least
+ one of the captured CONN_SET_IP_V4 events.
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=50)
+ payload = b"binary-format\n"
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+ with elog_capture("-c", "tcp", capture_path=capture):
+ received = loopback_oneshot(socket.AF_INET, port, payload)
+ assert payload.rstrip(b"\n") in received
+ # Wait long enough for the kernel to push everything
+ # and elog to drain its read; otherwise event_count
+ # in the file header may be < the events we expect.
+ time.sleep(2)
+
+ file_hdr, providers, events = parse_elog_binary(capture)
+
+ # File-header invariants. event_count is updated in-place at
+ # close (update_binary_header in elog.c), so it must equal
+ # what we walked.
+ assert file_hdr.event_count == len(events), (
+ f"event_count in header ({file_hdr.event_count}) != "
+ f"events walked ({len(events)})")
+ assert file_hdr.dropped_events == 0, (
+ f"kernel reported {file_hdr.dropped_events} dropped events "
+ f"during the capture")
+ assert file_hdr.capture_start > 0, (
+ "capture_start should be the boot-relative timestamp at "
+ "capture start; got 0")
+ assert file_hdr.start_utc_us > 0, (
+ "start_utc_us should be the wall-clock UTC microseconds "
+ "at capture start; got 0")
+
+ # Provider map invariants: at least one entry named "tcp".
+ # Each TCP function block (freebsd, rack, ngen_*) registers its
+ # own eventlog_provider and the kernel's name-based subscribe
+ # path matches every provider whose name is "tcp", so a single
+ # `-c tcp` subscription can pull in multiple provider ids on
+ # an image that builds more than one stack. The invariant is
+ # that *every* event in the capture came from one of those tcp
+ # providers, not that there is exactly one of them.
+ tcp_ids = {pid for pid, name in providers.items()
+ if name == "tcp"}
+ assert tcp_ids, (
+ f"no tcp provider in capture; got {providers!r}")
+
+ # Per-event invariants and per-session bookkeeping. We check
+ # invariants on EVERY event in the file but only build the
+ # detailed per-session event-id map: VMs always have some
+ # background TCP activity, and per-session content assertions
+ # only make sense for the sessions we created ourselves
+ # (identified below by matching the CONN_SET_IP_V4 four-tuple
+ # against `port`).
+ assert events, "no events recorded in capture"
+ per_session_last_ts = {}
+ per_session_event_ids = {}
+ for hdr, ev_payload in events:
+ assert hdr.provider_id in tcp_ids, (
+ f"unexpected provider_id={hdr.provider_id} in event "
+ f"(tcp provider ids: {sorted(tcp_ids)}; "
+ f"providers: {providers!r})")
+ # event_length self-consistency.
+ assert (hdr.event_length ==
+ ELOG_EVENT_HEADER_SIZE + len(ev_payload)), (
+ f"event_length {hdr.event_length} != header "
+ f"({ELOG_EVENT_HEADER_SIZE}) + payload "
+ f"({len(ev_payload)})")
+ # Sanity: no individual event should be huge. The kernel
+ # caps payloads at a few hundred bytes; 4 KiB is a
+ # generous upper bound that still flags corruption.
+ assert hdr.event_length <= 4096, (
+ f"event_length {hdr.event_length} suspiciously large")
+ # Timestamp monotonicity within a session_id. Different
+ # sessions can interleave (different CPUs), so we don't
+ # check globally - only within a session.
+ last = per_session_last_ts.get(hdr.session_id)
+ if last is not None:
+ assert hdr.timestamp >= last, (
+ f"timestamp regression in session "
+ f"{hdr.session_id}: {last} -> {hdr.timestamp}")
+ per_session_last_ts[hdr.session_id] = hdr.timestamp
+ per_session_event_ids.setdefault(
+ hdr.session_id, []).append(hdr.event_id)
+
+ # Identify *our* loopback sessions: the ones whose
+ # CONN_SET_IP_V4 four-tuple touches `port`. The VM has
+ # incidental TCP activity (e.g. a kernel-internal session
+ # that may transit only SESSION_CREATE/SESSION_END), so we
+ # can't assume that every session in the capture belongs to
+ # this test.
+ v4_events = [
+ (h, p) for h, p in events
+ if h.event_id == TCP_EVENT_CONN_SET_IP_V4]
+ assert v4_events, "no CONN_SET_IP_V4 events to validate"
+
+ our_sessions = set()
+ seen_ports = set()
+ for hdr, ev_payload in v4_events:
+ info = parse_conn_set_ip_v4_payload(ev_payload)
+ seen_ports.add(info["src_port"])
+ seen_ports.add(info["dst_port"])
+ if info["src_port"] == port or info["dst_port"] == port:
+ # Validate the CONN_SET_IP_V4 payload structure for
+ # connections we know are ours: addresses must be
+ # loopback. (We don't enforce 127.0.0.1 on every
+ # CONN_SET_IP_V4 in the capture; an unrelated kernel
+ # session could legitimately have a different IP.)
+ assert info["src_addr"] == "127.0.0.1", (
+ f"CONN_SET_IP_V4 src {info['src_addr']!r} != "
+ f"127.0.0.1 in our session {hdr.session_id}")
+ assert info["dst_addr"] == "127.0.0.1", (
+ f"CONN_SET_IP_V4 dst {info['dst_addr']!r} != "
+ f"127.0.0.1 in our session {hdr.session_id}")
+ our_sessions.add(hdr.session_id)
+ assert port in seen_ports, (
+ f"chosen port {port} not present in any CONN_SET_IP_V4 "
+ f"four-tuple (saw ports: {sorted(seen_ports)})")
+ # Loopback creates two PCBs (client + server), each with its
+ # own eventlog session, so we expect exactly 2 of "ours". We
+ # accept >= 2 to remain robust against unforeseen retries
+ # while still catching a "we lost a side" regression.
+ assert len(our_sessions) >= 2, (
+ f"expected ≥2 loopback sessions touching port {port}, "
+ f"got {len(our_sessions)}: {sorted(our_sessions)}")
+
+ # For each session WE created, validate the full lifecycle
+ # is present in the binary record stream.
+ for sid in sorted(our_sessions):
+ ids = per_session_event_ids[sid]
+ assert EVENTLOG_SESSION_CREATE_ID in ids, (
+ f"session {sid}: no SESSION_CREATE event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert EVENTLOG_SESSION_END_ID in ids, (
+ f"session {sid}: no SESSION_END event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert TCP_EVENT_CONN_SET_IP_V4 in ids, (
+ f"session {sid}: no CONN_SET_IP_V4 event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert TCP_EVENT_MSS in ids, (
+ f"session {sid}: no MSS event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert (TCP_EVENT_IN in ids) or (TCP_EVENT_OUT in ids), (
+ f"session {sid}: no IN or OUT data event "
+ f"(event_ids: {sorted(set(ids))})")
+
+ # Validate the SESSION_CREATE payload structure for the
+ # SESSION_CREATE events of our loopback sessions: 8-byte tp
+ # pointer that the kernel passes from
+ # tcp_eventlog_session_init must be non-zero.
+ our_sc_events = [
+ (h, p) for h, p in events
+ if (h.event_id == EVENTLOG_SESSION_CREATE_ID
+ and h.session_id in our_sessions)]
+ assert len(our_sc_events) >= 2, (
+ f"expected ≥2 SESSION_CREATE events for our loopback "
+ f"sessions, got {len(our_sc_events)}")
+ for hdr, ev_payload in our_sc_events:
+ sc = parse_session_create_payload(ev_payload)
+ assert sc["tp"] != 0, (
+ f"SESSION_CREATE for session {hdr.session_id}: "
+ f"tp pointer is 0 (kernel did not record tcpcb)")
diff --git a/usr.bin/elog/elog.1 b/usr.bin/elog/elog.1
--- a/usr.bin/elog/elog.1
+++ b/usr.bin/elog/elog.1
@@ -65,7 +65,7 @@
Subscribe to events from the specified provider.
.Bl -tag -width indent
.It Ar provider
-The name of the eventlog provider.
+The name of the eventlog provider (e.g., "tcp").
This argument is required.
.It Ar level
Optional log level.
@@ -106,6 +106,18 @@
.It Fl -event-name
Show the event name (e.g., IN, OUT, SESSION_CREATE) in square brackets
after the session ID in each output line.
+.It Fl f Ar type
+.It Fl -format Ar type
+Rename per-session output files with connection metadata.
+The only supported
+.Ar type
+is
+.Cm tcp ,
+which renames files with TCP connection metadata.
+Files are named using the format
+.Ar log_id Ns _ Ns Ar local_port Ns _ Ns Ar remote_ip Ns _ Ns Ar remote_port Ns .elog .
+Requires
+.Fl o Cm dir= Ns Ar path .
.It Fl n
.It Fl -event-number
Print a serial event number at the beginning of each output line.
@@ -121,7 +133,9 @@
.It Fl o Cm dir= Ns Ar path
Write one binary file per session under
.Ar path .
-Each session gets its own file, named by session ID.
+Each session gets its own file, named by session ID (or by TCP metadata
+when combined with
+.Fl f Cm tcp ) .
The directory is created if it does not exist.
.It Fl p
.It Fl -providers
@@ -261,49 +275,49 @@
.Sh EXIT STATUS
.Ex -std
.Sh EXAMPLES
-Subscribe to all events from a provider at VERBOSE level with all keywords:
+Subscribe to all TCP events at VERBOSE level with all keywords:
.Bd -literal -offset indent
-elog -c provider
+elog -c tcp
.Ed
.Pp
-Subscribe to a provider at INFO level with all keywords:
+Subscribe to TCP events at INFO level with all keywords:
.Bd -literal -offset indent
-elog -c provider INFO
+elog -c tcp INFO
.Ed
.Pp
-Subscribe to a provider at INFO level with specific keywords by name:
+Subscribe to TCP events at INFO level with specific keywords by name:
.Bd -literal -offset indent
-elog -c provider INFO KEYWORD1|KEYWORD2
+elog -c tcp INFO CC|RX|TX|PACING|CONN
.Ed
.Pp
-Subscribe to a provider at INFO level with keywords as hex mask:
+Subscribe to TCP events at INFO level with keywords as hex mask:
.Bd -literal -offset indent
-elog -c provider INFO 0x3F
+elog -c tcp INFO 0x3F
.Ed
.Pp
-Subscribe to a provider at TRACE level:
+Subscribe to all TCP events including TRACE-level detail:
.Bd -literal -offset indent
-elog -c provider TRACE
+elog -c tcp TRACE
.Ed
.Pp
Subscribe to multiple providers:
.Bd -literal -offset indent
-elog -c provider1 INFO -c provider2 WARN
+elog -c tcp INFO -c provider WARN
.Ed
.Pp
Subscribe using numeric levels:
.Bd -literal -offset indent
-elog -c provider 3 0x3F
+elog -c tcp 3 0x3F
.Ed
.Pp
Set buffer size to 4MB per CPU:
.Bd -literal -offset indent
-elog -b 4M -c provider
+elog -b 4M -c tcp
.Ed
.Pp
Write output to a file (binary format):
.Bd -literal -offset indent
-elog -c provider -o /tmp/events.bin
+elog -c tcp -o /tmp/events.bin
.Ed
.Pp
Read binary file and convert to text:
@@ -313,27 +327,27 @@
.Pp
Show relative timestamps from trace start:
.Bd -literal -offset indent
-elog -t -c provider
+elog -t -c tcp
.Ed
.Pp
Show both relative and inter-event delta timestamps:
.Bd -literal -offset indent
-elog -t --delta-time -c provider
+elog -t --delta-time -c tcp
.Ed
.Pp
-Write per-session files into a directory:
+Write per-session files with TCP metadata naming:
.Bd -literal -offset indent
-elog -o dir=/tmp/traces -c provider
+elog -o dir=/tmp/traces -f tcp -c tcp
.Ed
.Pp
Print statistics on exit:
.Bd -literal -offset indent
-elog -s -c provider
+elog -s -c tcp
.Ed
.Pp
Run a fixed-duration capture, suitable for scripted use:
.Bd -literal -offset indent
-elog --duration 30 -c provider1 -c provider2 -o /tmp/cap.elog
+elog --duration 30 -c tcp -o /tmp/cap.elog
.Ed
.Sh SEE ALSO
.Xr elog 5 ,
diff --git a/usr.bin/elog/elog.c b/usr.bin/elog/elog.c
--- a/usr.bin/elog/elog.c
+++ b/usr.bin/elog/elog.c
@@ -26,6 +26,9 @@
#include <sys/time.h>
#include <time.h>
#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
#include <zlib.h>
/* Include consumer header for formatting events */
@@ -65,6 +68,8 @@
static uint64_t prev_event_ts = 0;
static bool dump_state = false; /* Replay current state on subscribe */
static char *output_dir = NULL; /* If set, one file per session */
+/* Rename per-session files using TCP connection metadata */
+static bool format_tcp = false;
/* Per-session file state for -o dir= mode */
struct session_file {
@@ -76,6 +81,13 @@
uint64_t capture_start;
uint64_t start_utc_us;
uint64_t event_count;
+ /* TCP connection metadata for file rename (-f tcp). */
+ char log_id[64];
+ char remote_ip[INET6_ADDRSTRLEN];
+ uint16_t local_port;
+ uint16_t remote_port;
+ bool has_log_id;
+ bool has_conn_info;
};
static STAILQ_HEAD(, session_file) session_files =
STAILQ_HEAD_INITIALIZER(session_files);
@@ -150,6 +162,8 @@
" -s, --stats Print detailed statistics on exit\n"
" -o, --output <file> Write binary output to file (default: stdout)\n"
" -o dir=<path> Write one binary file per session under <path>\n"
+" -f, --format <type> Rename per-session files using connection metadata\n"
+" Supported types: tcp\n"
" -r, --read-binary <file>\n"
" Read binary file and convert to text (.gz ok)\n"
" -t, --relative-time Show time relative to first event\n"
@@ -376,9 +390,9 @@
if (j == 0)
snprintf(sanitized, sizeof(sanitized), "global");
- sf = malloc(sizeof(*sf));
+ sf = calloc(1, sizeof(*sf));
if (sf == NULL)
- err(1, "malloc(session_file)");
+ err(1, "calloc(session_file)");
sf->session_id = strdup(session_id);
if (sf->session_id == NULL)
err(1, "strdup");
@@ -391,9 +405,6 @@
sf->fp = fopen(fullpath, "wb");
if (sf->fp == NULL)
err(1, "fopen(%s)", fullpath);
- sf->header_written = false;
- sf->capture_start = 0;
- sf->event_count = 0;
STAILQ_INSERT_TAIL(&session_files, sf, link);
return (sf->fp);
}
@@ -438,6 +449,107 @@
err(1, "fwrite(binary header)");
}
+/*
+ * Extract TCP connection metadata from CONN_SET_IP_V[46] / LOG_ID
+ * events for the file-rename feature (-f tcp). Only the first log_id
+ * and first connection address seen for a session are captured.
+ */
+static void
+extract_tcp_metadata(struct session_file *sf,
+ const struct eventlog_event_header *hdr,
+ const void *payload, size_t payload_size)
+{
+ if (sf == NULL || (sf->has_log_id && sf->has_conn_info) ||
+ strcmp(get_provider_name(hdr->provider_id), "tcp") != 0)
+ return;
+
+ switch (hdr->event_id) {
+ case TCP_EVENTLOG_CONN_SET_IP_V4_ID: {
+ const struct tcp_eventlog_conn_set_ip_v4 *evt;
+ if (sf->has_conn_info)
+ break;
+ if (payload_size < sizeof(*evt))
+ break;
+ evt = (const struct tcp_eventlog_conn_set_ip_v4 *)payload;
+ sf->local_port = ntohs(evt->src_port);
+ sf->remote_port = ntohs(evt->dst_port);
+ inet_ntop(AF_INET, &evt->dst_addr, sf->remote_ip,
+ sizeof(sf->remote_ip));
+ sf->has_conn_info = true;
+ break;
+ }
+ case TCP_EVENTLOG_CONN_SET_IP_V6_ID: {
+ const struct tcp_eventlog_conn_set_ip_v6 *evt;
+ if (sf->has_conn_info)
+ break;
+ if (payload_size < sizeof(*evt))
+ break;
+ evt = (const struct tcp_eventlog_conn_set_ip_v6 *)payload;
+ sf->local_port = ntohs(evt->src_port);
+ sf->remote_port = ntohs(evt->dst_port);
+ inet_ntop(AF_INET6, &evt->dst_addr, sf->remote_ip,
+ sizeof(sf->remote_ip));
+ sf->has_conn_info = true;
+ break;
+ }
+ case TCP_EVENTLOG_LOG_ID_ID: {
+ const struct tcp_eventlog_log_id *evt;
+ if (sf->has_log_id)
+ break;
+ if (payload_size < sizeof(*evt))
+ break;
+ evt = (const struct tcp_eventlog_log_id *)payload;
+ if (evt->log_id[0] != '\0') {
+ strlcpy(sf->log_id, evt->log_id, sizeof(sf->log_id));
+ sf->has_log_id = true;
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * Rename a session file based on captured TCP metadata.
+ * Format: <log_id>_<local_port>_<remote_ip>_<remote_port>.elog
+ * Missing fields are replaced with "unknown". If no metadata at all,
+ * skip the rename.
+ */
+static void
+rename_session_file(struct session_file *sf)
+{
+ char newpath[PATH_MAX];
+ const char *log_id_str, *remote_ip_str;
+ char local_port_str[8], remote_port_str[8];
+
+ if (!format_tcp || sf == NULL || sf->filepath == NULL ||
+ output_dir == NULL)
+ return;
+ if (!sf->has_log_id && !sf->has_conn_info)
+ return;
+
+ log_id_str = sf->has_log_id ? sf->log_id : "unknown";
+ remote_ip_str = sf->has_conn_info ? sf->remote_ip : "unknown";
+ if (sf->has_conn_info) {
+ snprintf(local_port_str, sizeof(local_port_str), "%u",
+ sf->local_port);
+ snprintf(remote_port_str, sizeof(remote_port_str), "%u",
+ sf->remote_port);
+ } else {
+ strlcpy(local_port_str, "unknown", sizeof(local_port_str));
+ strlcpy(remote_port_str, "unknown", sizeof(remote_port_str));
+ }
+
+ if (snprintf(newpath, sizeof(newpath), "%s/%s_%s_%s_%s.elog",
+ output_dir, log_id_str, local_port_str, remote_ip_str,
+ remote_port_str) >= (int)sizeof(newpath))
+ return;
+
+ if (rename(sf->filepath, newpath) == 0) {
+ free(sf->filepath);
+ sf->filepath = strdup(newpath);
+ }
+}
+
static void
close_session_file(const char *session_id)
{
@@ -451,6 +563,7 @@
fflush(sf->fp);
fclose(sf->fp);
sf->fp = NULL;
+ rename_session_file(sf);
STAILQ_REMOVE(&session_files, sf, session_file, link);
free(sf->filepath);
free(sf->session_id);
@@ -491,7 +604,8 @@
struct session_file *sf;
size_t event_length;
- snprintf(session_id_str, sizeof(session_id_str), "%lu",
+ snprintf(session_id_str, sizeof(session_id_str), "%s_%lu",
+ get_provider_name(hdr->provider_id),
(unsigned long)hdr->session_id);
if (output_dir != NULL) {
@@ -515,6 +629,8 @@
sf->start_utc_us = utc_us;
}
sf->event_count++;
+ if (format_tcp)
+ extract_tcp_metadata(sf, hdr, payload, payload_size);
}
event_length = sizeof(struct eventlog_event_header) + payload_size;
@@ -1013,6 +1129,7 @@
{ "event-number", no_argument, NULL, 'n' },
{ "providers", no_argument, NULL, 'p' },
{ "stats", no_argument, NULL, 's' },
+ { "format", required_argument, NULL, 'f' },
{ "output", required_argument, NULL, 'o' },
{ "read-binary", required_argument, NULL, 'r' },
{ "relative-time", no_argument, NULL, 't' },
@@ -1028,7 +1145,7 @@
* [keywords]" can consume the operands that follow it; getopt_long
* still allows clustered flags such as "-enr file.elog".
*/
- while ((ch = getopt_long(argc, argv, "+b:c:dDehno:pr:st", longopts,
+ while ((ch = getopt_long(argc, argv, "+b:c:dDef:hno:pr:st", longopts,
NULL)) != -1) {
switch (ch) {
case 'c':
@@ -1090,6 +1207,11 @@
case 'D':
dump_state = true;
break;
+ case 'f':
+ if (strcmp(optarg, "tcp") != 0)
+ errx(1, "--format requires 'tcp' argument");
+ format_tcp = true;
+ break;
case 'h':
usage();
break;
@@ -1231,6 +1353,7 @@
fflush(sf->fp);
fclose(sf->fp);
sf->fp = NULL;
+ rename_session_file(sf);
}
free(sf->filepath);
free(sf->session_id);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Jun 21, 2:32 AM (6 h, 54 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34136693
Default Alt Text
D57309.id179809.diff (118 KB)
Attached To
Mode
D57309: tcp: add eventlog(9) provider for TCP
Attached
Detach File
Event Timeline
Log In to Comment