Page MenuHomeFreeBSD

D57309.id179809.diff
No OneTemporary

D57309.id179809.diff

diff --git a/include/eventlog/tcp_eventlog_schema.src b/include/eventlog/tcp_eventlog_schema.src
new file mode 100644
--- /dev/null
+++ b/include/eventlog/tcp_eventlog_schema.src
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * This file defines the schema for TCP event logging.
+ * It is processed by eventlog_gen.awk to generate the event log header
+ *
+ * Format:
+ * PROVIDER <name>
+ * Defines the provider name used as a prefix for all generated definitions
+ *
+ * ENUM <name> <value1>:<name1> <value2>:<name2> ...
+ * Defines an enumeration for converting numeric values to strings.
+ * Used for single-value fields (e.g., ENUM timer_reason 0:PACING 1:DELACK)
+ *
+ * FLAG <name> <value1>:<name1> <value2>:<name2> ...
+ * Defines a flag set for converting bitmask values to strings.
+ * Used for fields that can have multiple flags combined (e.g., FLAG tcp_flags 1:SYN 2:ACK)
+ *
+ * STRUCT <name> <field1>:<type1>[:enum_or_flag] <field2>:<type2>[:enum_or_flag] ...
+ * Defines a structure for event data. Each field is specified as name:type[:enum_or_flag]
+ * (e.g., STRUCT IN seq:uint32_t ack:uint32_t)
+ * For enum/flag fields, use enum_<enum_name> or flag_<flag_name> (e.g., reason:uint8_t:enum_timer_reason)
+ *
+ * KEYWORD <name> <value>
+ * Defines a keyword flag with a numeric value (e.g., KEYWORD DATA 1)
+ *
+ * EVENT <name> <id> <level> <keywords> <struct> <print_format>
+ * Defines an event. Extra spaces are allowed for alignment.
+ * Format strings should use %s for enum/flag fields (e.g., "reason=%s")
+ *
+ * Where:
+ * name: Event name (e.g., IN)
+ * id: Numeric event ID
+ * level: Log level (ERROR, WARN, INFO, VERBOSE)
+ * keywords: Comma-separated list of keyword flags (e.g., DATA)
+ * struct: Name of the STRUCT definition to use for this event, or NONE for
+ * events with no payload. NONE events generate macros that take only
+ * the session parameter and emit zero-length payloads.
+ * print_format: printf-style format string for text output (use %s for enum/flag fields)
+ *
+ * Schema evolution
+ *
+ * Event payloads are decoded by struct layout: the consumer maps an event ID
+ * to a STRUCT and reads sizeof(STRUCT) bytes (or, for size-aware consumers,
+ * the payload length carried in the event header). Two kinds of changes are
+ * supported:
+ *
+ * 1. Appending fields to an existing STRUCT is allowed. Size-aware
+ * consumers will pick up the new tail; older consumers that decode by
+ * exact-size match must be updated (or made size-aware) before they
+ * can read captures emitted by the new kernel. Existing on-disk
+ * captures with the smaller payload continue to decode against the
+ * older field list — readers should treat any missing tail fields as
+ * zero.
+ *
+ * 2. Reordering, removing, or retyping fields in an existing STRUCT is
+ * NOT allowed — it silently breaks decoding of older captures.
+ * Instead, define a new STRUCT and a new EVENT with a `_V2` (or
+ * `_V3`, ...) suffix and a new event ID, put the new fields there,
+ * and mark the previous EVENT as deprecated in a comment while
+ * keeping its definition so older captures continue to decode.
+ *
+ * Consumers learn new event IDs at their own pace; unrecognized event IDs
+ * render as [UNKNOWN_EVENT_ID:N].
+ */
+
+PROVIDER tcp
+
+/* Packet direction */
+KEYWORD RX 1
+KEYWORD TX 2
+/* Loss detection and recovery (RTO, retransmit, DSACK, enter/exit recovery) */
+KEYWORD LOSS 3
+/* Congestion control algorithm state (cwnd, PRR, hystart, goodput, RTT) */
+KEYWORD CC 4
+/* Pacing decisions and hardware offload */
+KEYWORD PACING 5
+/* Protocol timer machinery */
+KEYWORD TIMER 6
+/* Application-facing events (sends, socket options, socket buffer wakeups) */
+KEYWORD APP 7
+/* Connection lifecycle and identity (address binding, log ID) */
+KEYWORD CONN 8
+/* High-frequency RTT samples (every SRTT update) */
+KEYWORD RTT 9
+
+/*
+ * Enumerations
+ */
+
+ENUM timer_reason
+ 0:PACING
+ 1:DELAYACK
+ 2:RACK
+ 3:TLP
+ 4:RXT
+ 5:PERSIST
+ 6:KEEPALIVE
+ 7:TWOMSL
+
+ENUM tcp_state
+ 0:CLOSED
+ 1:LISTEN
+ 2:SYN_SENT
+ 3:SYN_RECEIVED
+ 4:ESTABLISHED
+ 5:CLOSE_WAIT
+ 6:FIN_WAIT_1
+ 7:CLOSING
+ 8:LAST_ACK
+ 9:FIN_WAIT_2
+ 10:TIME_WAIT
+
+ENUM socket_option
+ 1:TCP_NODELAY
+ 2:TCP_MAXSEG
+ 4:TCP_NOPUSH
+ 8:TCP_NOOPT
+ 16:TCP_MD5SIG
+ 32:TCP_INFO
+ 33:TCP_STATS
+ 34:TCP_LOG
+ 35:TCP_LOGBUF
+ 36:TCP_LOGID
+ 37:TCP_LOGDUMP
+ 38:TCP_LOGDUMPID
+ 39:TCP_TXTLS_ENABLE
+ 40:TCP_TXTLS_MODE
+ 41:TCP_RXTLS_ENABLE
+ 42:TCP_RXTLS_MODE
+ 43:TCP_IWND_NB
+ 44:TCP_IWND_NSEG
+ 45:TCP_USE_DDP
+ 46:TCP_LOGID_CNT
+ 47:TCP_LOG_TAG
+ 48:TCP_USER_LOG
+ 64:TCP_CONGESTION
+ 65:TCP_CCALGOOPT
+ 68:TCP_MAXUNACKTIME
+ 70:TCP_IDLE_REDUCE
+ 71:TCP_REMOTE_UDP_ENCAPS_PORT
+ 72:TCP_DELACK
+ 73:TCP_FIN_IS_RST
+ 74:TCP_LOG_LIMIT
+ 75:TCP_SHARED_CWND_ALLOWED
+ 76:TCP_PROC_ACCOUNTING
+ 77:TCP_USE_CMP_ACKS
+ 78:TCP_PERF_INFO
+ 79:TCP_RST_REASON_CODE
+ 80:TCP_RST_REASON_ENABLE
+ 81:TCP_EVENTLOG
+ 128:TCP_KEEPINIT
+ 256:TCP_KEEPIDLE
+ 512:TCP_KEEPINTVL
+ 1024:TCP_KEEPCNT
+ 1025:TCP_FASTOPEN
+ 1026:TCP_REUSPORT_LB_NUMA
+ 1050:TCP_RACK_MBUF_QUEUE
+ 1052:TCP_RACK_TLP_REDUCE
+ 1054:TCP_RACK_PACE_MAX_SEG
+ 1055:TCP_RACK_PACE_ALWAYS
+ 1057:TCP_RACK_PRR_SENDALOT
+ 1058:TCP_RACK_MIN_TO
+ 1060:TCP_RACK_EARLY_SEG
+ 1061:TCP_RACK_REORD_THRESH
+ 1062:TCP_RACK_REORD_FADE
+ 1063:TCP_RACK_TLP_THRESH
+ 1064:TCP_RACK_PKT_DELAY
+ 1067:TCP_BBR_IWINTSO
+ 1069:TCP_BBR_STARTUP_PG
+ 1070:TCP_BBR_DRAIN_PG
+ 1072:TCP_BBR_PROBE_RTT_INT
+ 1074:TCP_BBR_STARTUP_LOSS_EXIT
+ 1076:TCP_BBR_TSLIMITS
+ 1077:TCP_BBR_PACE_OH
+ 1079:TCP_BBR_USEDEL_RATE
+ 1080:TCP_BBR_MIN_RTO
+ 1081:TCP_BBR_MAX_RTO
+ 1083:TCP_BBR_ALGORITHM
+ 1086:TCP_BBR_PACE_PER_SEC
+ 1087:TCP_BBR_PACE_DEL_TAR
+ 1088:TCP_BBR_PACE_SEG_MAX
+ 1089:TCP_BBR_PACE_SEG_MIN
+ 1090:TCP_BBR_PACE_CROSS
+ 1095:TCP_RACK_TLP_USE
+ 1096:TCP_BBR_TMR_PACE_OH
+ 1097:TCP_RACK_DO_DETECTION
+ 1098:TCP_BBR_RACK_RTT_USE
+ 1099:TCP_BBR_RETRAN_WTSO
+ 1100:TCP_DATA_AFTER_CLOSE
+ 1101:TCP_BBR_PROBE_RTT_GAIN
+ 1102:TCP_BBR_PROBE_RTT_LEN
+ 1103:TCP_BBR_SEND_IWND_IN_TSO
+ 1104:TCP_BBR_USE_RACK_RR
+ 1105:TCP_BBR_HDWR_PACE
+ 1106:TCP_BBR_UTTER_MAX_TSO
+ 1107:TCP_BBR_EXTRA_STATE
+ 1108:TCP_BBR_FLOOR_MIN_TSO
+ 1109:TCP_BBR_MIN_TOPACEOUT
+ 1110:TCP_BBR_TSTMP_RAISES
+ 1111:TCP_BBR_POLICER_DETECT
+ 1112:TCP_BBR_RACK_INIT_RATE
+ 1113:TCP_RACK_RR_CONF
+ 1114:TCP_RACK_GP_INCREASE_CA
+ 1115:TCP_RACK_GP_INCREASE_SS
+ 1116:TCP_RACK_GP_INCREASE_REC
+ 1117:TCP_RACK_FORCE_MSEG
+ 1118:TCP_RACK_PACE_RATE_CA
+ 1119:TCP_RACK_PACE_RATE_SS
+ 1120:TCP_RACK_PACE_RATE_REC
+ 1122:TCP_NO_PRR
+ 1123:TCP_RACK_NONRXT_CFG_RATE
+ 1124:TCP_SHARED_CWND_ENABLE
+ 1125:TCP_TIMELY_DYN_ADJ
+ 1126:TCP_RACK_NO_PUSH_AT_MAX
+ 1127:TCP_RACK_PACE_TO_FILL
+ 1128:TCP_SHARED_CWND_TIME_LIMIT
+ 1129:TCP_RACK_PROFILE
+ 1130:TCP_HDWR_RATE_CAP
+ 1131:TCP_PACING_RATE_CAP
+ 1132:TCP_HDWR_UP_ONLY
+ 1133:TCP_RACK_ABC_VAL
+ 1134:TCP_REC_ABC_VAL
+ 1135:TCP_RACK_MEASURE_CNT
+ 1136:TCP_DEFER_OPTIONS
+ 1138:TCP_RACK_PACING_BETA
+ 1139:TCP_RACK_PACING_BETA_ECN
+ 1140:TCP_RACK_TIMER_SLOP
+ 1141:TCP_RACK_DSACK_OPT
+ 1142:TCP_RACK_ENABLE_HYSTART
+ 1143:TCP_RACK_SET_RXT_OPTIONS
+ 1144:TCP_RACK_HI_BETA
+ 1145:TCP_RACK_SPLIT_LIMIT
+ 1146:TCP_RACK_PACING_DIVISOR
+ 1147:TCP_RACK_PACE_MIN_SEG
+ 1148:TCP_RACK_DGP_IN_REC
+ 1150:TCP_HYBRID_PACING
+ 1151:TCP_PACING_DND
+ 1152:TCP_SS_EEXIT
+ 1153:TCP_DGP_UPPER_BOUNDS
+ 1154:TCP_NO_TIMELY
+ 1155:TCP_HONOR_HPTS_MIN
+ 1156:TCP_REC_IS_DYN
+ 1157:TCP_SIDECHAN_DIS
+ 1158:TCP_FILLCW_RATE_CAP
+ 1160:TCP_STACK_SPEC_INFO
+ 1161:RACK_CSPR_IS_FCC
+ 1162:TCP_GP_USE_LTBW
+ 1163:TCP_NGEN_GPUT_MAXSEG
+ 8192:TCP_FUNCTION_BLK
+ 8193:TCP_FUNCTION_ALIAS
+ 2147483643:TCP_RXT_CLAMP
+ 2147483645:TCP_LOG_DETAILED
+ 2147483647:TCP_LOG_RATIO
+
+ENUM sb_wake_type
+ 1:RCV
+ 2:SND
+
+ENUM output_way
+ 0:NORMAL
+ 1:PERSIST
+ 2:HPTS
+ 3:OUTPUT
+ 4:DROP_SYNFIN
+ 5:DROP_MISSING_TS
+ 6:COMPLETED
+
+ENUM collapse_dir
+ 0:UNCOLLAPSED
+ 1:COLLAPSED
+ 2:RSM_MARKED
+ 3:SPLIT
+
+/*
+ * Flags
+ */
+
+FLAG tcp_flags
+ 0x01:FIN
+ 0x02:SYN
+ 0x04:RST
+ 0x08:PUSH
+ 0x10:ACK
+ 0x20:URG
+ 0x40:ECE
+ 0x80:CWR
+ 0x100:AE
+
+FLAG sb_flags
+ 0x01:SB_TLS_RX
+ 0x02:SB_TLS_RX_RUNNING
+ 0x04:SB_WAIT
+ 0x08:SB_SEL
+ 0x10:SB_ASYNC
+ 0x20:SB_UPCALL
+ 0x40:SB_AUTOLOWAT
+ 0x80:SB_AIO
+ 0x100:SB_KNOTE
+ 0x200:SB_NOCOALESCE
+ 0x400:SB_IN_TOE
+ 0x800:SB_AUTOSIZE
+ 0x2000:SB_AIO_RUNNING
+ 0x4000:SB_SPLICED
+ 0x8000:SB_TLS_RX_RESYNC
+
+FLAG sb_state
+ 0x10:SBS_CANTSENDMORE
+ 0x20:SBS_CANTRCVMORE
+ 0x40:SBS_RCVATMARK
+
+FLAG sendfile_flags
+ 0x01:SF_NODISKIO
+ 0x02:SF_MNOWAIT
+ 0x08:SF_USER_READAHEAD
+ 0x10:SF_NOCACHE
+
+FLAG prus_flags
+ 0x1:PRUS_OOB
+ 0x2:PRUS_EOF
+ 0x4:PRUS_MORETOCOME
+ 0x8:PRUS_NOTREADY
+
+FLAG conn_params_flags
+ 0x01:CP_SACK_PERMIT
+ 0x02:CP_TIMESTAMPS
+ 0x04:CP_ECN
+ 0x08:CP_ACE
+ 0x10:CP_FASTOPEN
+ 0x20:CP_SIGNATURE
+ 0x40:CP_WINDOW_SCALE
+
+/*
+ * Structures - grouped by event category
+ */
+
+/* Connection / Session */
+STRUCT SESSION_CREATE
+ tp:void*
+STRUCT CONN_SET_IP_V4
+ src_addr:in_addr_t
+ src_port:uint16_t:ntohs
+ dst_addr:in_addr_t
+ dst_port:uint16_t:ntohs
+STRUCT CONN_SET_IP_V6
+ src_addr:in6_addr_t
+ src_port:uint16_t:ntohs
+ dst_addr:in6_addr_t
+ dst_port:uint16_t:ntohs
+STRUCT LOG_ID
+ log_id:char[64]
+STRUCT CONN_PARAMS
+ iss:uint32_t
+ irs:uint32_t
+ mss:uint32_t
+ snd_wscale:uint8_t
+ rcv_wscale:uint8_t
+ flags:uint8_t:flag_conn_params_flags
+ cc_algo:char[16]
+ tcp_stack:char[32]
+STRUCT TCP_STACK_CHANGE
+ tcp_stack:char[32]
+STRUCT CC_ALGO_CHANGE
+ cc_algo:char[16]
+STRUCT MSS
+ mss:uint32_t
+STRUCT CONN_OWNER
+ pid:int32_t
+
+/* Data plane */
+STRUCT IN
+ seq:uint32_t
+ ack:uint32_t
+ flags:uint16_t:flag_tcp_flags
+ win_raw:uint16_t
+ win:uint32_t
+ len:uint32_t
+ state:uint8_t:enum_tcp_state
+ th_off:uint8_t
+ th_sum:uint16_t:hex
+ th_urp:uint16_t
+ rcv_time_us:uint64_t
+STRUCT OUT
+ seq:uint32_t
+ ack:uint32_t
+ flags:uint16_t:flag_tcp_flags
+ win_raw:uint16_t
+ win:uint32_t
+ len:uint32_t
+ cwnd:uint32_t
+ state:uint8_t:enum_tcp_state
+ th_off:uint8_t
+ th_sum:uint16_t:hex
+ th_urp:uint16_t
+STRUCT USER_SEND
+ error:int32_t
+ flags:uint32_t:flag_prus_flags
+ state:uint8_t:enum_tcp_state
+ sndbuf_used:uint32_t
+ sndbuf_avail:uint32_t
+ send_len:uint32_t
+STRUCT SENDFILE
+ offset:uint64_t
+ length:uint64_t
+ flags:uint32_t:flag_sendfile_flags
+STRUCT ACK
+ acked:uint32_t
+ snd_una:uint32_t
+ inflight:uint32_t
+ sacked:uint32_t
+ nsegs:uint32_t
+ rcv_time_us:uint64_t
+STRUCT SACK
+ acked:uint32_t
+ snd_una:uint32_t
+ inflight:uint32_t
+ sacked:uint32_t
+ nsegs:uint32_t
+ rcv_time_us:uint64_t
+ num_blocks:uint8_t
+ blk1_start:uint32_t
+ blk1_end:uint32_t
+ blk2_start:uint32_t
+ blk2_end:uint32_t
+ blk3_start:uint32_t
+ blk3_end:uint32_t
+ blk4_start:uint32_t
+ blk4_end:uint32_t
+
+/* Congestion control / Window */
+STRUCT CWND
+ cwnd:uint32_t
+ ssthresh:uint32_t
+STRUCT CC_PARAMS
+ beta:uint32_t
+ beta_ecn:uint32_t
+STRUCT ENTER_RECOVERY
+ cwnd:uint32_t
+ ssthresh:uint32_t
+STRUCT EXIT_RECOVERY
+ cwnd:uint32_t
+ ssthresh:uint32_t
+STRUCT PRR_STATE
+ prr_out:uint32_t
+ recovery_fs:uint32_t
+ prr_sndcnt:uint32_t
+ prr_delivered:uint32_t
+ sacked:uint32_t
+ holes_rxt:uint32_t
+ inflight:uint32_t
+STRUCT WINDOW_COLLAPSE
+ count:uint32_t
+ direction:uint8_t:enum_collapse_dir
+ inflight:uint32_t
+ has_collapsed:uint8_t
+STRUCT HYSTART
+ round:uint32_t
+ high_seq:uint32_t
+ snd_max:uint32_t
+ mod:uint8_t
+STRUCT GOODPUT
+ gp_bw:uint64_t
+ raw_bw:uint64_t
+ utim:uint64_t
+ gp_srtt:uint32_t
+ lowest_rtt:uint32_t
+
+/* RTT */
+STRUCT RTT
+ rtt:uint32_t
+ srtt:uint32_t
+ path_rtt:uint32_t
+ lowest_rtt:uint32_t
+ gp_filter_min:uint32_t
+ gp_srtt:uint32_t
+
+STRUCT RTT_SAMPLE
+ rtt:uint32_t
+
+/* Loss / Retransmission */
+STRUCT RTO
+ timeout_num:int32_t
+ min_rtt:uint32_t
+ rtt:uint32_t
+ segment_len:uint32_t
+ inflight:uint32_t
+ pkts_out:uint32_t
+STRUCT RETRANSMIT_REASON
+ threshold:uint32_t
+ dupack_count:uint32_t
+ rsm_start:uint32_t
+ rsm_end:uint32_t
+ inflight:uint32_t
+STRUCT DSACK
+ dsack_round_seen:uint8_t
+ dsack_round_end:uint32_t
+ num_dsack:uint32_t
+ dsack_persist:uint32_t
+ mod:uint8_t
+ round:uint32_t
+
+/* Scheduling / Timers */
+STRUCT TIMER_START
+ timeout_us:uint64_t
+ reason:uint8_t:enum_timer_reason
+STRUCT TIMER_CANCEL
+ reason:uint8_t:enum_timer_reason
+STRUCT TIMER_FIRED
+ reason:uint8_t:enum_timer_reason
+STRUCT TIMER_PROCESSED
+ reason:uint8_t:enum_timer_reason
+STRUCT SB_WAKE
+ sb_flags:uint32_t:flag_sb_flags
+ len:uint32_t
+ sb_state:uint32_t:flag_sb_state
+ type:uint8_t:enum_sb_wake_type
+
+/* Pacing */
+STRUCT PACING_CALC
+ slot:uint32_t
+ bw_est:uint64_t
+ method:uint8_t
+ quality:uint8_t
+ in_slow_start:uint8_t
+ inflight:uint32_t
+STRUCT PACE_STATE
+ rc_pace_min_segs:uint32_t
+ rc_pace_max_segs:uint32_t
+STRUCT HW_PACE_RATE
+ requested_rate:uint64_t
+ hw_rate:uint64_t
+ error:int32_t
+ mod:uint8_t
+
+/* Configuration */
+STRUCT SOCKET_OPT_UINT32
+ option_num:uint32_t:enum_socket_option
+ option_val:uint32_t
+STRUCT SOCKET_OPT_ERR
+ option_num:uint32_t:enum_socket_option
+ err:int32_t
+
+/* Output */
+STRUCT OUTPUT_DECISION
+ did_output:int32_t
+ nxt_pkt:int32_t
+ way_out:int32_t:enum_output_way
+ nsegs:int32_t
+ inflight:uint32_t
+ in_persist:uint8_t
+
+/*
+ * Events - grouped by category, renumbered for logical ordering
+ */
+
+/* Session */
+EVENT SESSION_CREATE 4294967294 INFO SESSION SESSION_CREATE
+ "Session created (tp=%1)"
+
+/* Connection (0-99) */
+EVENT CONN_SET_IP_V4 0 INFO CONN CONN_SET_IP_V4
+ "IPv4 connection %1:%2 -> %3:%4"
+EVENT CONN_SET_IP_V6 1 INFO CONN CONN_SET_IP_V6
+ "IPv6 connection %1:%2 -> %3:%4"
+EVENT LOG_ID 2 INFO CONN LOG_ID
+ "Log ID assigned: %1"
+EVENT CONN_PARAMS 3 INFO CONN CONN_PARAMS
+ "Connection parameters: ISS %1, IRS %2, MSS %3, snd_wscale %4, rcv_wscale %5, flags %6, cc %7, stack %8"
+EVENT MSS 4 INFO CONN MSS
+ "MSS set to %1 bytes"
+EVENT CONN_OWNER 5 INFO CONN CONN_OWNER
+ "Connection owned by PID %1"
+EVENT TCP_STACK_CHANGE 6 INFO CONN TCP_STACK_CHANGE
+ "TCP stack changed to %1"
+EVENT CC_ALGO_CHANGE 7 INFO CONN,CC CC_ALGO_CHANGE
+ "CC algorithm changed to %1"
+
+/* Data plane (100-199) */
+EVENT IN 100 VERBOSE RX IN
+ "Received %6 bytes (seq %1, ack %2, win %5, flags %3) in state %7 rcv_time %11 us"
+EVENT OUT 101 VERBOSE TX OUT
+ "Sent %6 bytes (seq %1, ack %2, win %5, flags %3) in state %8 cwnd %7"
+EVENT ACK 102 VERBOSE RX,CC ACK
+ "Acknowledged %1 bytes to %2 (%3 in flight, %4 sacked, %5 segments, rcv_time %6 us)"
+EVENT SACK 103 VERBOSE RX,CC SACK
+ "SACKed %1 bytes to %2 (%3 in flight, %4 sacked, %5 segs, rcv_time %6 us, %7 blocks: [%8-%9] [%10-%11] [%12-%13] [%14-%15])"
+EVENT USER_SEND 104 INFO TX,APP USER_SEND
+ "Application send of %6 bytes in state %3 (%4 used, %5 available in sndbuf, flags %2, error %1)"
+EVENT SENDFILE 105 INFO TX,APP SENDFILE
+ "Sendfile %2 bytes at offset %1 (flags %3)"
+
+/* Congestion control / Window (200-299) */
+EVENT CWND 200 VERBOSE CC CWND
+ "Congestion window now %1 (ssthresh %2)"
+EVENT CC_PARAMS 201 INFO CC CC_PARAMS
+ "CC parameters changed: beta %1, beta_ecn %2"
+EVENT ENTER_RECOVERY 202 WARN CC,LOSS ENTER_RECOVERY
+ "Entered loss recovery (cwnd %1, ssthresh %2)"
+EVENT EXIT_RECOVERY 203 WARN CC,LOSS EXIT_RECOVERY
+ "Exited loss recovery (cwnd %1, ssthresh %2)"
+EVENT PRR_STATE 204 TRACE CC,LOSS PRR_STATE
+ "PRR recovery: %1 sent of %3 allowed, %4 delivered of %2 flight (%5 sacked, %6 retransmitted, %7 in flight)"
+EVENT WINDOW_COLLAPSE 205 WARN CC WINDOW_COLLAPSE
+ "Peer window %2: %1 segments affected (%3 bytes in flight)"
+EVENT HYSTART 206 TRACE CC HYSTART
+ "Slow start round %1 completed (high_seq %2, snd_max %3, event %4)"
+EVENT GOODPUT 207 INFO CC GOODPUT
+ "Goodput sample %2 bps (EWMA %1 bps) over %3 us, gp_srtt %4 us, lowest_rtt %5 us"
+
+/* RTT (300-399) */
+EVENT RTT 300 INFO RTT RTT
+ "RTT sample %1 us applied to srtt %2 us (path %3 us, all-time low %4 us, GP min %5 us, gp_srtt %6 us)"
+EVENT RTT_SAMPLE 301 TRACE RTT RTT_SAMPLE
+ "RTT sample %1 us"
+
+/* Loss / Retransmission (400-499) */
+EVENT RTO 400 WARN LOSS,TX RTO
+ "Timeout #%1 (rtt %3 us, min_rtt %2 us, %4 byte segment, %5 in flight, %6 pkts out)"
+EVENT RETRANSMIT_REASON 401 WARN LOSS,TX RETRANSMIT_REASON
+ "Retransmitting bytes %3-%4 (threshold %1, %2 dupacks, %5 in flight)"
+EVENT DSACK 402 INFO LOSS,RX DSACK
+ "DSACK detected: %3 total in round %6 (persist %4)"
+
+/* Timers (500-599) */
+EVENT TIMER_START 500 VERBOSE TIMER TIMER_START
+ "Started %2 timer for %1 us"
+EVENT TIMER_CANCEL 501 VERBOSE TIMER TIMER_CANCEL
+ "Canceled %1 timer"
+EVENT TIMER_FIRED 502 VERBOSE TIMER TIMER_FIRED
+ "Fired %1 timer"
+EVENT SB_WAKE 503 VERBOSE APP SB_WAKE
+ "Socket buffer %4 wakeup (%2 bytes, flags %1, state %3)"
+EVENT TIMER_PROCESSED 504 VERBOSE TIMER TIMER_PROCESSED
+ "Processed %1 timer"
+
+/* Pacing (600-699) */
+EVENT PACING_CALC 600 VERBOSE PACING PACING_CALC
+ "Pacing delay %1 us at %2 bps (method %3, quality %4, %6 in flight)"
+EVENT PACE_STATE 601 TRACE PACING PACE_STATE
+ "Pacing burst size: min %1, max %2 bytes per segment"
+EVENT HW_PACE_RATE 602 INFO PACING HW_PACE_RATE
+ "Hardware pacer set to %2 bps (requested %1 bps, error %3)"
+
+/* Configuration (700-799) */
+EVENT SOCKET_OPT_UINT32 700 INFO APP,CONN SOCKET_OPT_UINT32
+ "Socket option %1 set to %2"
+EVENT SOCKET_OPT_ERR 701 WARN APP,CONN SOCKET_OPT_ERR
+ "Socket option %1 failed with error %2"
+
+/* Output (800-899) */
+EVENT OUTPUT_DECISION 800 TRACE TX OUTPUT_DECISION
+ "Segment processing done: sent %4 segments via %3 (%5 in flight, persist %6)"
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -31,7 +31,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd May 7, 2026
+.Dd May 14, 2026
.Dt TCP 4
.Os
.Sh NAME
@@ -145,6 +145,24 @@
See
.Xr mod_cc 4
for details.
+.It Dv TCP_EVENTLOG
+Enable or disable per-connection event logging via the kernel eventlog framework.
+This option accepts a single integer argument: a non-zero value enables event
+logging for the connection, and zero disables it.
+.Pp
+When enabled, the TCP stack emits structured events for the connection
+.Pq e.g., segment transmission and reception, RTT samples, congestion window changes, retransmissions
+that can be captured by the
+.Xr elog 1
+user-space tool.
+Enabling eventlog on a connection also triggers an immediate dump of the
+current connection state, so that a subscriber sees the full context even if
+it attached after the connection was established.
+.Pp
+The system-wide default for new connections is controlled by the
+.Va kern.eventlog.tcp.default
+sysctl, documented in
+.Xr eventlog 9 .
.It Dv TCP_FASTOPEN
Enable or disable TCP Fast Open (TFO).
To use this option, the kernel must be built with the
@@ -1092,6 +1110,7 @@
when trying to use a TCP function block that is not available;
.El
.Sh SEE ALSO
+.Xr elog 1 ,
.Xr getsockopt 2 ,
.Xr setfib 2 ,
.Xr socket 2 ,
@@ -1110,6 +1129,7 @@
.Xr tcp_rack 4 ,
.Xr setkey 8 ,
.Xr sysctl 8 ,
+.Xr eventlog 9 ,
.Xr tcp_functions 9
.Rs
.%A "V. Jacobson"
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -1130,12 +1130,12 @@
td);
sendfile_iodone(sfio, NULL, 0, error);
}
-#ifdef TCP_REQUEST_TRK
if (so->so_proto->pr_protocol == IPPROTO_TCP) {
- /* log the sendfile call to the TCP log, if enabled */
+ tcp_eventlog_sendfile(so, offset, nbytes, flags);
+#ifdef TCP_REQUEST_TRK
tcp_log_sendfile(so, offset, nbytes, flags);
- }
#endif
+ }
m = NULL;
if (error)
goto done;
diff --git a/sys/modules/cc/cc_cubic/Makefile b/sys/modules/cc/cc_cubic/Makefile
--- a/sys/modules/cc/cc_cubic/Makefile
+++ b/sys/modules/cc/cc_cubic/Makefile
@@ -2,4 +2,6 @@
KMOD= cc_cubic
SRCS= cc_cubic.c opt_kern_tls.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
.include <bsd.kmod.mk>
diff --git a/sys/modules/cc/cc_newreno/Makefile b/sys/modules/cc/cc_newreno/Makefile
--- a/sys/modules/cc/cc_newreno/Makefile
+++ b/sys/modules/cc/cc_newreno/Makefile
@@ -2,4 +2,6 @@
KMOD= cc_newreno
SRCS= cc_newreno.c opt_kern_tls.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
.include <bsd.kmod.mk>
diff --git a/sys/modules/tcp/bbr/Makefile b/sys/modules/tcp/bbr/Makefile
--- a/sys/modules/tcp/bbr/Makefile
+++ b/sys/modules/tcp/bbr/Makefile
@@ -11,6 +11,8 @@
SRCS+= opt_kern_tls.h
SRCS+= opt_ratelimit.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
#
# Enable full debugging
#
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -11,6 +11,8 @@
SRCS+= opt_kern_tls.h
SRCS+= opt_ratelimit.h
+EVENTLOG_SCHEMA= tcp_eventlog_schema.src
+
#
# Enable full debugging
#
diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c
--- a/sys/netinet/cc/cc_cubic.c
+++ b/sys/netinet/cc/cc_cubic.c
@@ -69,6 +69,8 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
+#include <sys/eventlog.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>
@@ -123,9 +125,11 @@
*/
struct tcpcb *tp;
+ tp = ccv->tp;
+ TCP_EVENTLOG_HYSTART_LOG(tp->t_eventlog_session,
+ cubicd->css_current_round, 0, tp->snd_max, mod);
if (hystart_bblogs == 0)
return;
- tp = ccv->tp;
if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c
--- a/sys/netinet/cc/cc_newreno.c
+++ b/sys/netinet/cc/cc_newreno.c
@@ -79,6 +79,8 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
+#include <sys/eventlog.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>
#include <netinet/cc/cc_newreno.h>
@@ -133,9 +135,13 @@
*/
struct tcpcb *tp;
- if (hystart_bblogs == 0)
+ if ((mod != 7) && (mod != 2) && (mod != 6))
return;
tp = ccv->tp;
+ TCP_EVENTLOG_HYSTART_LOG(tp->t_eventlog_session,
+ nreno->css_current_round, 0, tp->snd_max, mod);
+ if (hystart_bblogs == 0)
+ return;
if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -219,6 +219,7 @@
#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */
#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */
#define TCP_PERF_INFO 78 /* retrieve accounting counters */
+#define TCP_EVENTLOG 81 /* enable/disable eventlog */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -107,6 +107,8 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_hpts.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
@@ -1513,6 +1515,18 @@
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
tlen, NULL, true);
+ TCP_EVENTLOG_IN_LOG(tp->t_eventlog_session,
+ th->th_seq,
+ th->th_ack,
+ thflags,
+ th->th_win,
+ (uint32_t)th->th_win << tp->snd_scale,
+ tlen,
+ tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ th->th_urp,
+ tcp_get_u64_usecs(NULL));
if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
@@ -1805,7 +1819,19 @@
* typically means increasing the congestion
* window.
*/
+ {
+ uint32_t cwnd_before = tp->snd_cwnd;
cc_ack_received(tp, th, nsegs, CC_ACK);
+ TCP_EVENTLOG_ACK_LOG(tp->t_eventlog_session,
+ acked, th->th_ack,
+ tp->snd_max - th->th_ack,
+ 0, nsegs,
+ tcp_get_u64_usecs(NULL));
+ if (tp->snd_cwnd != cwnd_before)
+ TCP_EVENTLOG_CWND_LOG(
+ tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
+ }
tp->snd_una = th->th_ack;
/*
@@ -2038,6 +2064,8 @@
tcp_ecn_input_syn_sent(tp, thflags, iptos);
+ tcp_eventlog_conn_params(tp);
+
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
@@ -2650,6 +2678,9 @@
}
/* Congestion signal before ack. */
cc_cong_signal(tp, th, CC_NDUPACK);
+ TCP_EVENTLOG_ENTER_RECOVERY_LOG(
+ tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
cc_ack_received(tp, th, nsegs, CC_DUPACK);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
@@ -2953,7 +2984,18 @@
* control related information. This typically means increasing
* the congestion window.
*/
+ {
+ uint32_t cwnd_before = tp->snd_cwnd;
cc_ack_received(tp, th, nsegs, CC_ACK);
+ TCP_EVENTLOG_ACK_LOG(tp->t_eventlog_session,
+ acked, th->th_ack,
+ tp->snd_max - th->th_ack,
+ 0, nsegs,
+ tcp_get_u64_usecs(NULL));
+ if (tp->snd_cwnd != cwnd_before)
+ TCP_EVENTLOG_CWND_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
+ }
if (acked > sbavail(&so->so_snd)) {
if (tp->snd_wnd >= sbavail(&so->so_snd))
@@ -2983,6 +3025,9 @@
if (IN_RECOVERY(tp->t_flags) &&
SEQ_GEQ(th->th_ack, tp->snd_recover)) {
cc_post_recovery(tp, th);
+ TCP_EVENTLOG_EXIT_RECOVERY_LOG(
+ tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
}
if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
tp->snd_recover = tp->snd_una;
@@ -3616,6 +3661,14 @@
TCPSTAT_INC(tcps_rttupdated);
if (tp->t_rttupdated < UCHAR_MAX)
tp->t_rttupdated++;
+
+ TCP_EVENTLOG_RTT_LOG(tp->t_eventlog_session,
+ rtt * 1000,
+ (tp->t_srtt >> TCP_RTT_SHIFT) * 1000,
+ 0,
+ tp->t_rttlow,
+ 0, 0);
+
#ifdef STATS
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
imax(0, rtt * 1000 / hz));
@@ -3850,6 +3903,7 @@
mss = max(mss, 64);
tp->t_maxseg = mss;
+ TCP_EVENTLOG_MSS_LOG(tp->t_eventlog_session, tp->t_maxseg);
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
* The MSS is so small we should not process incoming
@@ -3912,6 +3966,7 @@
* XXXGL: shouldn't we reserve space for IP/IPv6 options?
*/
tp->t_maxseg = max(mss, 64);
+ TCP_EVENTLOG_MSS_LOG(tp->t_eventlog_session, tp->t_maxseg);
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
* The MSS is so small we should not process incoming
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -573,6 +573,7 @@
#ifdef DDB
void db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent);
#endif
+const char *tcp_log_id_str(struct tcpcb *tp);
#else /* !TCP_BLACKBOX */
#define tcp_log_verbose (false)
@@ -586,6 +587,11 @@
return (NULL);
}
+static inline const char *
+tcp_log_id_str(struct tcpcb *tp __unused)
+{
+ return (NULL);
+}
#endif /* TCP_BLACKBOX */
#endif /* _KERNEL */
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -355,6 +355,13 @@
RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
+const char *
+tcp_log_id_str(struct tcpcb *tp)
+{
+
+ return (tp->t_lib != NULL) ? tp->t_lib->tlb_id : NULL;
+}
+
static __inline void
tcp_log_id_validate_tree_lock(int tree_locked)
{
@@ -500,6 +507,8 @@
tcp_log_id_validate_tree_lock(*tree_locked);
TCPID_BUCKET_LOCK_ASSERT(tlb);
+ tcp_eventlog_on_log_id_clear(tp);
+
/*
* Remove the node, clear the log bucket and node from the TCPCB, and
* decrement the bucket refcount. In the process, if this is the
@@ -987,6 +996,9 @@
SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
tp->t_lib = tlb;
tp->t_lin = tln;
+
+ tcp_eventlog_on_log_id_set(tp, tlb->tlb_id);
+
if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) {
/* Clone in any logging */
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -79,6 +79,7 @@
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcpip.h>
@@ -1426,6 +1427,19 @@
else
lgb = NULL;
+ TCP_EVENTLOG_OUT_LOG(tp->t_eventlog_session,
+ ntohl(th->th_seq),
+ ntohl(th->th_ack),
+ tcp_get_flags(th),
+ ntohs(th->th_win),
+ (uint32_t)ntohs(th->th_win) << tp->rcv_scale,
+ len,
+ tp->snd_cwnd,
+ tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ ntohs(th->th_urp));
+
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -92,6 +92,7 @@
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_lro.h>
#ifdef TCP_OFFLOAD
@@ -2265,6 +2266,18 @@
TCP_LOG_IN, 0,
tlen, &log, true, &bbr->rc_tv);
}
+ TCP_EVENTLOG_IN_LOG(bbr->rc_tp->t_eventlog_session,
+ th->th_seq,
+ th->th_ack,
+ tcp_get_flags(th),
+ th->th_win,
+ (uint32_t)th->th_win << bbr->rc_tp->snd_scale,
+ tlen,
+ bbr->rc_tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ th->th_urp,
+ tcp_tv_to_lusec(&bbr->rc_tv));
}
static void
@@ -8789,6 +8802,7 @@
(TF_RCVD_SCALE | TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
}
+ tcp_eventlog_conn_params(tp);
tp->rcv_adv += min(tp->rcv_wnd,
TCP_MAXWIN << tp->rcv_scale);
/*
@@ -13505,6 +13519,18 @@
} else {
lgb = NULL;
}
+ TCP_EVENTLOG_OUT_LOG(tp->t_eventlog_session,
+ ntohl(th->th_seq),
+ ntohl(th->th_ack),
+ tcp_get_flags(th),
+ ntohs(th->th_win),
+ (uint32_t)ntohs(th->th_win) << tp->rcv_scale,
+ len,
+ tp->snd_cwnd,
+ tp->t_state,
+ th->th_off,
+ ntohs(th->th_sum),
+ ntohs(th->th_urp));
/*
* Fill in IP length and desired time to live and send to IP level.
* There should be a better way to handle ttl and tos; we could keep
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -88,6 +88,8 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <sys/eventlog.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
@@ -2828,6 +2830,8 @@
static void
rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
{
+ TCP_EVENTLOG_RTT_SAMPLE_LOG(rack->rc_tp->t_eventlog_session, rtt);
+
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
@@ -5413,6 +5417,21 @@
tp->t_ccv.flags |= CCF_USE_LOCAL_ABC;
CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
}
+ if (tp->snd_cwnd != prior_cwnd)
+ TCP_EVENTLOG_CWND_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
+ /*
+ * XXX rack_ack_received() does not currently have access to the
+ * caller's SACK blocks, so we always log a plain ACK event. Once
+ * the rack stack threads SACK info into rack_ack_received(), this
+ * call should switch to TCP_EVENTLOG_SACK_LOG when SACK info is
+ * present.
+ */
+ TCP_EVENTLOG_ACK_LOG(tp->t_eventlog_session,
+ acked, th_ack,
+ ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked),
+ rack->r_ctl.rc_sacked, nsegs,
+ tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
if (lgb) {
lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
}
@@ -5476,9 +5495,8 @@
static void
rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
{
- /*
- * Now exit recovery.
- */
+ TCP_EVENTLOG_EXIT_RECOVERY_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
EXIT_RECOVERY(tp->t_flags);
}
@@ -5660,6 +5678,8 @@
CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
}
if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
+ TCP_EVENTLOG_ENTER_RECOVERY_LOG(tp->t_eventlog_session,
+ tp->snd_cwnd, tp->snd_ssthresh);
rack_log_to_prr(rack, 15, cwnd_enter, line);
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.retran_during_recovery = 0;
@@ -7654,6 +7674,13 @@
} else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
tp->t_flags &= ~TF_PREVVALID;
KMOD_TCPSTAT_INC(tcps_rexmttimeo);
+ TCP_EVENTLOG_RTO_LOG(tp->t_eventlog_session,
+ tp->t_rxtshift,
+ tp->t_rttlow,
+ tp->t_srtt >> TCP_RTT_SHIFT,
+ tp->t_maxseg,
+ tp->snd_max - tp->snd_una,
+ 0);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
@@ -8660,6 +8687,13 @@
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
rack_log_rtt_sample(rack, rtt);
+ TCP_EVENTLOG_RTT_LOG(tp->t_eventlog_session,
+ rtt,
+ tp->t_srtt,
+ rack->r_ctl.rack_rs.rs_us_rtt,
+ rack->r_ctl.rc_lowest_us_rtt,
+ get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
+ rack->r_ctl.rc_gp_srtt);
tp->t_softerror = 0;
}
@@ -12808,6 +12842,8 @@
tcp_ecn_input_syn_sent(tp, thflags, iptos);
+ tcp_eventlog_conn_params(tp);
+
if (SEQ_GT(th->th_ack, tp->snd_una)) {
/*
* We advance snd_una for the
@@ -24693,6 +24729,12 @@
goto free_uma;
}
rack_init_sysctls();
+ static const struct eventlog_provider_config tcp_eventlog_cfg = {
+ .dump_callback = tcp_eventlog_dump_state,
+ .default_changed = tcp_eventlog_default_changed,
+ };
+ __tcp_rack.tfb_eventlog_provider = eventlog_provider_create(
+ "tcp", &tcp_eventlog_cfg);
num_stacks = nitems(rack_stack_names);
err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
rack_stack_names, &num_stacks);
@@ -24700,6 +24742,9 @@
printf("Failed to register %s stack name for "
"%s module\n", rack_stack_names[num_stacks],
__XSTRING(MODNAME));
+ eventlog_provider_destroy(
+ __tcp_rack.tfb_eventlog_provider);
+ __tcp_rack.tfb_eventlog_provider = NULL;
sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
uma_zdestroy(rack_zone);
@@ -24726,6 +24771,8 @@
rack_mod_inited = false;
}
tcp_lro_dereg_mbufq();
+ eventlog_provider_destroy(__tcp_rack.tfb_eventlog_provider);
+ __tcp_rack.tfb_eventlog_provider = NULL;
err = 0;
break;
default:
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -102,6 +102,7 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_ecn.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_lro.h>
@@ -256,6 +257,14 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, eventlog, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP eventlog controls");
+
+static int tcp_eventlog_log_id_limit = 0;
+SYSCTL_INT(_net_inet_tcp_eventlog, OID_AUTO, log_id_limit, CTLFLAG_RW,
+ &tcp_eventlog_log_id_limit, 0,
+ "Max eventlog sessions to create for Log ID buckets (-1=unlimited, 0=none); decrements as used");
+
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@@ -492,6 +501,287 @@
return (rblk);
}
+void
+tcp_eventlog_default_changed(struct eventlog_provider *provider __unused,
+ int value, void *arg __unused)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct inpcb *inp;
+ int enabled;
+
+ if (value != -1 && value != 2)
+ return;
+
+ enabled = (value == 2) ? 1 : 0;
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_RLOCKPCB);
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ struct tcpcb *tp = intotcpcb(inp);
+ if (tp->t_eventlog_session == NULL)
+ continue;
+ if (enabled &&
+ !eventlog_session_is_enabled(
+ tp->t_eventlog_session)) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp,
+ tcp_log_id_str(tp));
+ } else if (!enabled) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 0);
+ }
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+}
+
+/*
+ * Handle eventlog session when switching TCP stacks.
+ * - If the new stack has a provider and we have no session, create one.
+ * If the user previously set TCP_EVENTLOG (TF2_EVENTLOG_ENABLED),
+ * honor it.
+ * - If the new stack has no provider and we have a session, destroy it.
+ * Caller must hold inp lock.
+ */
+void
+tcp_ensure_eventlog_session_on_switch(struct tcpcb *tp,
+ struct tcp_function_block *tfb)
+{
+ struct tcp_eventlog_session_create create_payload;
+
+ INP_WLOCK_ASSERT(tptoinpcb(tp));
+ if (tfb->tfb_eventlog_provider != NULL) {
+ /* New stack has a provider: create a session if needed. */
+ if (tp->t_eventlog_session != NULL) {
+ /* Mid-connection swap between two eventlog stacks */
+ TCP_EVENTLOG_TCP_STACK_CHANGE_LOG(
+ tp->t_eventlog_session,
+ tfb->tfb_tcp_block_name);
+ return;
+ }
+ create_payload.tp = tp;
+ tp->t_eventlog_session = eventlog_session_create(
+ tfb->tfb_eventlog_provider,
+ tptoinpcb(tp)->inp_gencnt,
+ false,
+ &create_payload,
+ sizeof(create_payload));
+ if (tp->t_flags2 & TF2_EVENTLOG_ENABLED) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp,
+ tcp_log_id_str(tp));
+ }
+ } else {
+ /* New stack has no provider: end any existing session. */
+ if (tp->t_eventlog_session != NULL)
+ TCP_EVENTLOG_TCP_STACK_CHANGE_LOG(
+ tp->t_eventlog_session,
+ tfb->tfb_tcp_block_name);
+ eventlog_session_destroy(tp->t_eventlog_session);
+ tp->t_eventlog_session = NULL;
+ }
+}
+
+/*
+ * Emit a CONN_PARAMS event derived entirely from tp. Callers must invoke
+ * this only after the handshake has settled the negotiated parameters
+ * (window scales, MSS, ECN/SACK/etc). Reads the negotiated receive window
+ * scale from tp->request_r_scale (not tp->rcv_scale): the passive-open path
+ * emits before tp->rcv_scale is installed, and after the handshake completes
+ * tp->rcv_scale = tp->request_r_scale, so request_r_scale is the canonical
+ * source in both states.
+ */
+void
+tcp_eventlog_conn_params(struct tcpcb *tp)
+{
+ uint8_t flags = 0;
+ const char *cc_name = "";
+ const char *stack_name = "";
+
+ KASSERT(tp != NULL, ("tcp_eventlog_conn_params: tp == NULL"));
+
+ if (!TCP_EVENTLOG_CONN_PARAMS_ENABLED(tp->t_eventlog_session))
+ return;
+
+ if (tp->t_flags & TF_SACK_PERMIT)
+ flags |= TCP_EVENTLOG_FLAG_CP_SACK_PERMIT;
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ flags |= TCP_EVENTLOG_FLAG_CP_TIMESTAMPS;
+ if (tp->t_flags2 & TF2_ECN_PERMIT)
+ flags |= TCP_EVENTLOG_FLAG_CP_ECN;
+ if (tp->t_flags2 & TF2_ACE_PERMIT)
+ flags |= TCP_EVENTLOG_FLAG_CP_ACE;
+ if (tp->t_flags & TF_FASTOPEN)
+ flags |= TCP_EVENTLOG_FLAG_CP_FASTOPEN;
+ if (tp->t_flags & TF_SIGNATURE)
+ flags |= TCP_EVENTLOG_FLAG_CP_SIGNATURE;
+ /*
+ * Window scaling is "negotiated" only when both sides exchanged the
+ * option in the SYN/SYN-ACK; without this flag, snd_wscale=rcv_wscale=0
+ * is ambiguous between "shift 0 negotiated" and "wscale absent".
+ */
+ if ((tp->t_flags & (TF_REQ_SCALE | TF_RCVD_SCALE)) ==
+ (TF_REQ_SCALE | TF_RCVD_SCALE))
+ flags |= TCP_EVENTLOG_FLAG_CP_WINDOW_SCALE;
+
+ if (CC_ALGO(tp) != NULL)
+ cc_name = CC_ALGO(tp)->name;
+ if (tp->t_fb != NULL)
+ stack_name = tp->t_fb->tfb_tcp_block_name;
+
+ TCP_EVENTLOG_CONN_PARAMS_LOG_ALWAYS(tp->t_eventlog_session,
+ tp->iss, tp->irs, tp->t_maxseg, tp->snd_scale, tp->request_r_scale,
+ flags, cc_name, stack_name);
+}
+
+/*
+ * Emit eventlog events for a TCP connection's current state (SESSION_CREATE,
+ * CONN_SET_IP, LOG_ID). Caller must hold the inp lock.
+ */
+void
+tcp_eventlog_dump_session(struct tcpcb *tp, const char *log_id)
+{
+ struct inpcb *inp;
+
+ if (tp == NULL || tp->t_eventlog_session == NULL)
+ return;
+
+ inp = tptoinpcb(tp);
+
+ TCP_EVENTLOG_SESSION_CREATE_LOG(tp->t_eventlog_session, tp);
+
+ /*
+ * Only emit the address once the local port is assigned; otherwise a
+ * socket that enables logging before bind()/connect() would log an
+ * all-zero 4-tuple, and a later connect() would log a second one.
+ * A zero foreign port is fine: that is a listening socket.
+ */
+#ifdef INET
+ if ((inp->inp_vflag & INP_IPV4) && inp->inp_lport != 0) {
+ TCP_EVENTLOG_CONN_SET_IP_V4_LOG(tp->t_eventlog_session,
+ inp->inp_laddr, inp->inp_lport,
+ inp->inp_faddr, inp->inp_fport);
+ }
+#endif
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) && inp->inp_lport != 0) {
+ TCP_EVENTLOG_CONN_SET_IP_V6_LOG(tp->t_eventlog_session,
+ inp->inp_inc.inc6_laddr, inp->inp_lport,
+ inp->inp_inc.inc6_faddr, inp->inp_fport);
+ }
+#endif
+
+ if (log_id != NULL)
+ TCP_EVENTLOG_LOG_ID_LOG(tp->t_eventlog_session, log_id);
+
+ if (tp->iss != 0 || tp->irs != 0)
+ tcp_eventlog_conn_params(tp);
+
+ if (tp->t_owning_pid != 0)
+ TCP_EVENTLOG_CONN_OWNER_LOG(tp->t_eventlog_session,
+ tp->t_owning_pid);
+}
+
+/*
+ * Provider dump callback: emit current state for all TCP eventlog sessions.
+ * Invoked inline on the subscribing thread; uses normal event write macros
+ * which are automatically routed to the requesting subscriber.
+ */
+void
+tcp_eventlog_dump_state(struct eventlog_provider *provider __unused,
+ void *arg __unused)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct inpcb *inp;
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_RLOCKPCB);
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ struct tcpcb *tp = intotcpcb(inp);
+ if (tp->t_eventlog_session == NULL)
+ continue;
+ tcp_eventlog_dump_session(tp, tcp_log_id_str(tp));
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+}
+
+void
+tcp_eventlog_on_log_id_set(struct tcpcb *tp, const char *log_id)
+{
+ struct eventlog_provider *prov = tp->t_fb->tfb_eventlog_provider;
+
+ if (eventlog_provider_get_default(prov) == 0) {
+ bool can_enable = true;
+ if (tcp_eventlog_log_id_limit >= 0) {
+ int old;
+ do {
+ old = atomic_load_acq_int(
+ &tcp_eventlog_log_id_limit);
+ if (old <= 0) {
+ can_enable = false;
+ break;
+ }
+ } while (!atomic_cmpset_int(
+ &tcp_eventlog_log_id_limit,
+ old, old - 1));
+ }
+ if (can_enable) {
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp, log_id);
+ }
+ } else {
+ tcp_eventlog_dump_session(tp, log_id);
+ }
+}
+
+void
+tcp_eventlog_on_log_id_clear(struct tcpcb *tp)
+{
+ if (tp != NULL &&
+ eventlog_provider_get_default(tp->t_fb->tfb_eventlog_provider) == 0)
+ eventlog_session_set_enabled(tp->t_eventlog_session, 0);
+}
+
+void
+tcp_eventlog_sendfile(struct socket *so, off_t offset, size_t nbytes,
+ int flags)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_eventlog_sendfile: inp == NULL"));
+
+ /* quick check to see if logging is enabled for this connection */
+ tp = intotcpcb(inp);
+ if (tp->t_flags & TF_DISCONNECTED)
+ return;
+
+ INP_WLOCK(inp);
+ /* double check log state now that we have the lock */
+ if (tp->t_flags & TF_DISCONNECTED) {
+ INP_WUNLOCK(inp);
+ return;
+ }
+ TCP_EVENTLOG_SENDFILE_LOG(tp->t_eventlog_session,
+ offset, nbytes, flags);
+ INP_WUNLOCK(inp);
+}
+
void
tcp_switch_back_to_default(struct tcpcb *tp)
{
@@ -532,6 +822,7 @@
/* Now set in all the pointers */
tp->t_fb = tfb;
tp->t_fb_ptr = ptr;
+ tcp_ensure_eventlog_session_on_switch(tp, tfb);
return;
}
/*
@@ -566,6 +857,7 @@
/* And set in the pointers to the new */
tp->t_fb = tfb;
tp->t_fb_ptr = ptr;
+ tcp_ensure_eventlog_session_on_switch(tp, tfb);
}
static bool
@@ -1499,6 +1791,12 @@
/* Setup the tcp function block list */
TAILQ_INIT(&t_functions);
rw_init(&tcp_function_lock, "tcp_func_lock");
+ static const struct eventlog_provider_config tcp_eventlog_cfg = {
+ .dump_callback = tcp_eventlog_dump_state,
+ .default_changed = tcp_eventlog_default_changed,
+ };
+ tcp_def_funcblk.tfb_eventlog_provider = eventlog_provider_create("tcp",
+ &tcp_eventlog_cfg);
register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
sx_init(&tcpoudp_lock, "TCP over UDP configuration");
#ifdef TCP_BLACKBOX
@@ -2094,6 +2392,20 @@
if (flags & TH_RST)
TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
lgb = NULL;
+ if (tp != NULL) {
+ TCP_EVENTLOG_OUT_LOG(tp->t_eventlog_session,
+ ntohl(nth->th_seq),
+ ntohl(nth->th_ack),
+ tcp_get_flags(nth),
+ ntohs(nth->th_win),
+ (uint32_t)ntohs(nth->th_win) << tp->rcv_scale,
+ 0,
+ tp->snd_cwnd,
+ tp->t_state,
+ nth->th_off,
+ ntohs(nth->th_sum),
+ ntohs(nth->th_urp));
+ }
if ((tp != NULL) && tcp_bblogging_on(tp)) {
if (INP_WLOCKED(inp)) {
union tcp_log_stackspecific log;
@@ -2258,6 +2570,12 @@
return (NULL);
}
tp->t_fb = listening_tcb->t_fb;
+ /*
+ * Inherit owning PID from the listening socket so that
+ * passively-accepted connections are attributed to the
+ * process that created the listener.
+ */
+ tp->t_owning_pid = listening_tcb->t_owning_pid;
} else {
tp->t_fb = V_tcp_func_set_ptr;
}
@@ -2362,9 +2680,24 @@
/* Initialize the per-TCPCB log data. */
tcp_log_tcpcbinit(tp);
#endif
+
+ /* Create per-connection eventlog session. Session ID is inp_gencnt. */
+ {
+ struct tcp_eventlog_session_create create_payload;
+
+ create_payload.tp = tp;
+ tp->t_eventlog_session = eventlog_session_create(
+ tp->t_fb->tfb_eventlog_provider,
+ inp->inp_gencnt,
+ false,
+ &create_payload,
+ sizeof(create_payload));
+ }
+
tp->t_pacing_rate = -1;
if (tp->t_fb->tfb_tcp_fb_init) {
if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
+ eventlog_session_destroy(tp->t_eventlog_session);
if (CC_ALGO(tp)->cb_destroy != NULL)
CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
CC_DATA(tp) = NULL;
@@ -2472,6 +2805,12 @@
tcp_log_tcpcbfini(tp);
#endif
+ /*
+ * Destroy per-connection eventlog session (emits SESSION_END).
+ */
+ eventlog_session_destroy(tp->t_eventlog_session);
+ tp->t_eventlog_session = NULL;
+
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
@@ -5016,6 +5355,13 @@
l->tlb_flex2 = option_val;
}
}
+ if (err != 0) {
+ TCP_EVENTLOG_SOCKET_OPT_ERR_LOG(tp->t_eventlog_session,
+ option_num, err);
+ } else {
+ TCP_EVENTLOG_SOCKET_OPT_UINT32_LOG(tp->t_eventlog_session,
+ option_num, option_val);
+ }
}
uint32_t
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -92,6 +92,7 @@
#ifdef TCP_BLACKBOX
#include <netinet/tcp_log_buf.h>
#endif
+#include <eventlog/tcp_eventlog.h>
#ifdef TCP_OFFLOAD
#include <netinet/toecore.h>
#endif
@@ -852,6 +853,14 @@
/* Override flowlabel from in6_pcbconnect. */
inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
inp->inp_flow |= sc->sc_flowlabel;
+
+ if (inp->inp_vflag & INP_IPV6) {
+ TCP_EVENTLOG_CONN_SET_IP_V6_LOG(tp->t_eventlog_session,
+ inp->inp_inc.inc6_laddr,
+ inp->inp_lport,
+ inp->inp_inc.inc6_faddr,
+ inp->inp_fport);
+ }
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -876,6 +885,14 @@
error = in_pcbconnect(inp, &sin, thread0.td_ucred);
if (error != 0)
goto abort;
+
+ if (inp->inp_vflag & INP_IPV4) {
+ TCP_EVENTLOG_CONN_SET_IP_V4_LOG(tp->t_eventlog_session,
+ inp->inp_laddr,
+ inp->inp_lport,
+ inp->inp_faddr,
+ inp->inp_fport);
+ }
}
#endif /* INET */
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
@@ -961,6 +978,9 @@
*/
tcp_mss(tp, sc->sc_peer_mss);
+ /* Log once all handshake parameters are set. */
+ tcp_eventlog_conn_params(tp);
+
/*
* If the SYN,ACK was retransmitted, indicate that CWND to be
* limited to one segment in cc_conn_init().
@@ -999,6 +1019,17 @@
tcp_log_state_change(tp, tcp_get_bblog_state(sototcpcb(lso)));
}
#endif
+ /*
+ * Inherit eventlog enable state from the listener so that connections
+ * accepted on an eventlog-enabled listener are themselves observed.
+ * Enable the connection's session and dump its current state so
+ * subscribers see a complete view from the start.
+ */
+ if ((sototcpcb(lso)->t_flags2 & TF2_EVENTLOG_ENABLED) != 0) {
+ tp->t_flags2 |= TF2_EVENTLOG_ENABLED;
+ eventlog_session_set_enabled(tp->t_eventlog_session, 1);
+ tcp_eventlog_dump_session(tp, tcp_log_id_str(tp));
+ }
/*
* Copy and activate timers.
*/
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -65,6 +65,7 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
@@ -626,6 +627,13 @@
tcp_free_sackholes(tp);
}
TCPSTAT_INC(tcps_rexmttimeo);
+ TCP_EVENTLOG_RTO_LOG(tp->t_eventlog_session,
+ tp->t_rxtshift,
+ tp->t_rttlow,
+ tp->t_srtt >> TCP_RTT_SHIFT,
+ tp->t_maxseg,
+ tp->snd_max - tp->snd_una,
+ 0);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
@@ -822,6 +830,15 @@
return (rv);
}
+/* Map a tt_which onto the eventlog timer_reason enum. */
+static uint8_t const tcp_eventlog_timer_reason[TT_N] = {
+ [TT_REXMT] = TCP_EVENTLOG_TIMER_REASON_RXT,
+ [TT_PERSIST] = TCP_EVENTLOG_TIMER_REASON_PERSIST,
+ [TT_KEEP] = TCP_EVENTLOG_TIMER_REASON_KEEPALIVE,
+ [TT_2MSL] = TCP_EVENTLOG_TIMER_REASON_TWOMSL,
+ [TT_DELACK] = TCP_EVENTLOG_TIMER_REASON_DELAYACK,
+};
+
static void
tcp_bblog_timer(struct tcpcb *tp, tt_which which, tt_what what, uint32_t ticks)
{
@@ -886,8 +903,12 @@
tp->t_precisions[which] = 0;
tcp_bblog_timer(tp, which, TT_PROCESSING, 0);
+ TCP_EVENTLOG_TIMER_FIRED_LOG(tp->t_eventlog_session,
+ tcp_eventlog_timer_reason[which]);
if (tcp_timersw[which](tp)) {
tcp_bblog_timer(tp, which, TT_PROCESSED, 0);
+ TCP_EVENTLOG_TIMER_PROCESSED_LOG(tp->t_eventlog_session,
+ tcp_eventlog_timer_reason[which]);
if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
MPASS(tp->t_state > TCPS_CLOSED);
callout_reset_sbt_on(&tp->t_callout,
@@ -920,9 +941,14 @@
what = TT_STARTING;
callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
&tp->t_timers[which], &tp->t_precisions[which]);
+ TCP_EVENTLOG_TIMER_START_LOG(tp->t_eventlog_session,
+ (((uint64_t)delta * 1000000) + (hz - 1)) / hz,
+ tcp_eventlog_timer_reason[which]);
} else {
what = TT_STOPPING;
tp->t_timers[which] = SBT_MAX;
+ TCP_EVENTLOG_TIMER_CANCEL_LOG(tp->t_eventlog_session,
+ tcp_eventlog_timer_reason[which]);
}
tcp_bblog_timer(tp, which, what, delta);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -92,6 +92,7 @@
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <eventlog/tcp_eventlog.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
@@ -177,6 +178,12 @@
goto out;
}
tp->t_state = TCPS_CLOSED;
+ /*
+ * Record the PID of the process that opened this socket so the
+ * connection's eventlog can identify the owning application.
+ */
+ if (td != NULL && td->td_proc != NULL)
+ tp->t_owning_pid = td->td_proc->p_pid;
tcp_bblog_pru(tp, PRU_ATTACH, error);
INP_WUNLOCK(inp);
TCPSTATES_INC(TCPS_CLOSED);
@@ -897,6 +904,7 @@
u_int8_t incflagsav;
u_char vflagsav;
bool restoreflags;
+ u_int appended = 0;
INP_WLOCK(inp);
if (__predict_false(tp->t_flags & TF_DISCONNECTED)) {
@@ -1023,7 +1031,9 @@
if (!(flags & PRUS_OOB)) {
if (tp->t_acktime == 0)
tp->t_acktime = ticks;
+ appended = sbavail(&so->so_snd);
sbappendstream(&so->so_snd, m, flags);
+ appended = sbavail(&so->so_snd) - appended;
m = NULL;
if (nam && tp->t_state < TCPS_SYN_SENT) {
KASSERT(tp->t_state == TCPS_CLOSED,
@@ -1109,7 +1119,9 @@
*/
if (tp->t_acktime == 0)
tp->t_acktime = ticks;
+ appended = sbavail(&so->so_snd);
sbappendstream_locked(&so->so_snd, m, flags);
+ appended = sbavail(&so->so_snd) - appended;
SOCK_SENDBUF_UNLOCK(so);
m = NULL;
if (nam && tp->t_state < TCPS_SYN_SENT) {
@@ -1162,6 +1174,9 @@
&inp->inp_socket->so_snd,
TCP_LOG_USERSEND, error,
0, NULL, false);
+ TCP_EVENTLOG_USER_SEND_LOG(tp->t_eventlog_session,
+ error, flags, tp->t_state,
+ sbused(&so->so_snd), sbavail(&so->so_snd), appended);
out:
/*
@@ -1428,6 +1443,14 @@
if (error != 0)
return (error);
+ if (inp->inp_vflag & INP_IPV4) {
+ TCP_EVENTLOG_CONN_SET_IP_V4_LOG(tp->t_eventlog_session,
+ inp->inp_laddr,
+ inp->inp_lport,
+ inp->inp_faddr,
+ inp->inp_fport);
+ }
+
/* set the hash on the connection */
rss_proto_software_hash_v4(inp->inp_faddr, inp->inp_laddr,
inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
@@ -1475,6 +1498,14 @@
if (error != 0)
return (error);
+ if (inp->inp_vflag & INP_IPV6) {
+ TCP_EVENTLOG_CONN_SET_IP_V6_LOG(tp->t_eventlog_session,
+ inp->inp_inc.inc6_laddr,
+ inp->inp_lport,
+ inp->inp_inc.inc6_faddr,
+ inp->inp_fport);
+ }
+
/* set the hash on the connection */
rss_proto_software_hash_v6(&inp->in6p_faddr,
&inp->in6p_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
@@ -1753,6 +1784,7 @@
*/
tp->t_fb = blk;
tp->t_fb_ptr = ptr;
+ tcp_ensure_eventlog_session_on_switch(tp, blk);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
@@ -1946,6 +1978,8 @@
memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var));
/* Now attach the new, which takes a reference */
cc_attach(tp, algo);
+ TCP_EVENTLOG_CC_ALGO_CHANGE_LOG(tp->t_eventlog_session,
+ algo->name);
/* Ok now are we where we have gotten past any conn_init? */
if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
/* Yep run the connection init for the new CC */
@@ -2148,6 +2182,8 @@
} else {
tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
}
+ TCP_EVENTLOG_MSS_LOG(tp->t_eventlog_session,
+ tp->t_maxseg);
} else
error = EINVAL;
goto unlock_and_done;
@@ -2200,6 +2236,24 @@
INP_WUNLOCK(inp);
break;
+ case TCP_EVENTLOG:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+ INP_WLOCK_RECHECK(inp);
+ if (optval)
+ tp->t_flags2 |= TF2_EVENTLOG_ENABLED;
+ else
+ tp->t_flags2 &= ~TF2_EVENTLOG_ENABLED;
+ eventlog_session_set_enabled(
+ tp->t_eventlog_session, optval);
+ if (optval)
+ tcp_eventlog_dump_session(tp,
+ tcp_log_id_str(tp));
+ goto unlock_and_done;
+
#ifdef KERN_TLS
case TCP_TXTLS_ENABLE:
INP_WUNLOCK(inp);
@@ -2547,6 +2601,16 @@
error = EINVAL;
break;
#endif
+ case TCP_EVENTLOG:
+ if (tp->t_eventlog_session != NULL)
+ optval = eventlog_session_is_enabled(
+ tp->t_eventlog_session);
+ else
+ optval = (tp->t_flags2 &
+ TF2_EVENTLOG_ENABLED) ? 1 : 0;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
#ifdef KERN_TLS
case TCP_TXTLS_MODE:
error = ktls_get_tx_mode(so, &optval);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -444,6 +444,9 @@
struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;
struct tcp_log_id_bucket *t_lib;
+ struct eventlog_session *t_eventlog_session;
+ /* Per-connection eventlog session */
+ pid_t t_owning_pid; /* PID of process that opened this conn */
const char *t_output_caller; /* Function that called tcp_output */
struct statsblob *t_stats; /* Per-connection stats */
/* Should these be a pointer to the arrays or an array? */
@@ -624,6 +627,7 @@
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
+ struct eventlog_provider *tfb_eventlog_provider;
};
/* Maximum number of names each TCP function block can be registered with. */
@@ -855,6 +859,7 @@
#define TF2_PROC_SACK_PROHIBIT 0x00100000 /* Due to small MSS size do not process sack's */
#define TF2_IPSEC_TSO 0x00200000 /* IPSEC + TSO supported */
#define TF2_NO_ISS_CHECK 0x00400000 /* Don't check SEG.ACK against ISS */
+#define TF2_EVENTLOG_ENABLED 0x02000000 /* User requested eventlog enabled */
/* t_flags2 description for use with printf(9) %b identifier. */
#define TF2_BITS "\20" \
@@ -869,7 +874,7 @@
"\21TF2_SUPPORTS_MBUFQ\22TF2_MBUF_QUEUE_READY" \
"\23TF2_DONT_SACK_QUEUE\24TF2_CANNOT_DO_ECN" \
"\25TF2_PROC_SACK_PROHIBIT\26TF2_IPSEC_TSO" \
- "\27TF2_NO_ISS_CHECK"
+ "\27TF2_NO_ISS_CHECK\32TF2_EVENTLOG_ENABLED"
/*
* Structure to hold TCP options that are only used during segment
@@ -1276,6 +1281,7 @@
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_tcp);
SYSCTL_DECL(_net_inet_tcp_sack);
+SYSCTL_DECL(_net_inet_tcp_eventlog);
MALLOC_DECLARE(M_TCPLOG);
#endif
@@ -1441,6 +1447,17 @@
int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs);
uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity);
void tcp_switch_back_to_default(struct tcpcb *tp);
+void tcp_ensure_eventlog_session_on_switch(struct tcpcb *tp,
+ struct tcp_function_block *tfb);
+void tcp_eventlog_dump_state(struct eventlog_provider *provider, void *arg);
+void tcp_eventlog_default_changed(struct eventlog_provider *provider,
+ int value, void *arg);
+void tcp_eventlog_dump_session(struct tcpcb *tp, const char *log_id);
+void tcp_eventlog_on_log_id_set(struct tcpcb *tp, const char *log_id);
+void tcp_eventlog_on_log_id_clear(struct tcpcb *tp);
+void tcp_eventlog_sendfile(struct socket *so, off_t offset, size_t nbytes,
+ int flags);
+void tcp_eventlog_conn_params(struct tcpcb *tp);
struct tcp_function_block *
find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt);
diff --git a/tests/sys/kern/elog_test.py b/tests/sys/kern/elog_test.py
--- a/tests/sys/kern/elog_test.py
+++ b/tests/sys/kern/elog_test.py
@@ -4,27 +4,430 @@
# SPDX-License-Identifier: BSD-2-Clause
#
-"""ATF tests for the elog(1) userspace utility.
+"""End-to-end ATF tests for the elog(1) userspace utility.
-Smoke tests for the elog binary CLI surface. These catch packaging
-regressions (missing binary, broken option parser, broken capture-file
-reader) without requiring /dev/eventlog or any provider to be present.
+Validates the full path: elog binary -> /dev/eventlog -> kernel eventlog
+framework. Complements kern_eventlog_test (which exercises kernel-internal
+APIs via a ktest module) by covering the real character-device + IOCTL
+surface.
-End-to-end coverage of the device interface and individual providers
-lives with the providers themselves; the framework's own kernel-side
-tests are in kern_eventlog_test.py.
+Most cases require root because /dev/eventlog rejects jailed callers
+(kern_eventlog.c eventlog_dev_open) and the CREATE IOCTL needs to allocate
+per-CPU buffers.
"""
+import os
+import socket
+import struct
import subprocess
+import time
+from collections import namedtuple
+from contextlib import contextmanager
from pathlib import Path
import pytest
from atf_python.utils import BaseTest
ELOG = "/usr/bin/elog"
+EVENTLOG_DEV = "/dev/eventlog"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _sysctl_get(name):
+ """Return the current value of sysctl `name` as a string, or None if
+ the OID does not exist."""
+ r = subprocess.run(
+ ["sysctl", "-n", name],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if r.returncode != 0:
+ return None
+ return r.stdout.decode().strip()
+
+
+def _sysctl_set(name, value):
+ """Set sysctl `name` to `value`. Raises CalledProcessError on failure."""
+ subprocess.run(
+ ["sysctl", f"{name}={value}"],
+ stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
+ check=True)
+
+
+@contextmanager
+def sysctl_override(name, value):
+ """Save current value of sysctl `name`, set it to `value`, then
+ restore on exit. Skips the test if the OID is missing or the new
+ value is rejected."""
+ old = _sysctl_get(name)
+ if old is None:
+ pytest.skip(f"sysctl {name} not present")
+ try:
+ _sysctl_set(name, value)
+ except subprocess.CalledProcessError as e:
+ pytest.skip(f"cannot set {name}={value}: "
+ f"{e.stderr.decode().strip()}")
+ try:
+ yield old
+ finally:
+ # Restore unconditionally; ignore errors so cleanup can't mask
+ # the original assertion failure.
+ subprocess.run(
+ ["sysctl", f"{name}={old}"],
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False)
+
+
+def _require_eventlog_dev():
+ if not Path(EVENTLOG_DEV).is_char_device():
+ pytest.skip(f"{EVENTLOG_DEV} not present")
+
+
+def _pick_port(salt=0):
+ """Pseudo-random high port; deterministic across reruns of the same
+ pytest invocation but varies with the test process."""
+ return 20000 + ((os.getpid() + salt) % 30000)
+
+
+def _addr_for(family):
+ return "::1" if family == socket.AF_INET6 else "127.0.0.1"
+
+
+@contextmanager
+def loopback_pair(family, port, linger0=False):
+ """Open a listener+connected client pair on `(addr, port)` over loopback,
+ yield (server-accepted-conn, client) sockets, and close all three on
+ exit. Returns once the three-way handshake has completed (accept()
+ returns).
+
+ If linger0=True, the client is configured with SO_LINGER {1, 0} so
+ that close() emits a TCP RST and the PCB is reclaimed immediately
+ instead of going through TIME_WAIT.
+ """
+ addr = _addr_for(family)
+ srv = socket.socket(family, socket.SOCK_STREAM)
+ srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ cli = socket.socket(family, socket.SOCK_STREAM)
+ if linger0:
+ # struct linger { int l_onoff; int l_linger; }
+ cli.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
+ struct.pack("ii", 1, 0))
+ conn = None
+ try:
+ srv.bind((addr, port))
+ srv.listen(1)
+ cli.connect((addr, port))
+ conn, _ = srv.accept()
+ yield conn, cli
+ finally:
+ for s in (conn, cli, srv):
+ if s is not None:
+ try:
+ s.close()
+ except OSError:
+ pass
+
+
+def loopback_oneshot(family, port, payload):
+ """Drive a one-shot TCP loopback exchange and return what the server
+ received. Closes both ends before returning."""
+ addr = _addr_for(family)
+ with socket.socket(family, socket.SOCK_STREAM) as srv:
+ srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ srv.bind((addr, port))
+ srv.listen(1)
+ with socket.socket(family, socket.SOCK_STREAM) as cli:
+ cli.connect((addr, port))
+ conn, _ = srv.accept()
+ try:
+ cli.sendall(payload)
+ cli.shutdown(socket.SHUT_WR)
+ data = b""
+ while True:
+ chunk = conn.recv(4096)
+ if not chunk:
+ break
+ data += chunk
+ return data
+ finally:
+ conn.close()
+
+
+@contextmanager
+def elog_capture(*args, capture_path=None):
+ """Start `elog -s [args] [-o capture_path]` in the background. The
+ caller supplies all `-c <provider>` subscriptions and any other
+ flags. If `capture_path` is None no `-o` is passed (live text mode
+ on stdout). Yields the Popen handle. Sends SIGTERM and reaps on
+ exit; asserts a clean (rc=0) shutdown.
+ """
+ cmd = [ELOG, "-s", *args]
+ if capture_path is not None:
+ cmd += ["-o", str(capture_path)]
+ proc = subprocess.Popen(
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ # Give elog a tick to issue CREATE/GET_PROVIDERS before the caller
+ # drives traffic.
+ time.sleep(1)
+ if proc.poll() is not None:
+ err = proc.stderr.read().decode()
+ raise AssertionError(f"elog exited early: {err}")
+ try:
+ yield proc
+ finally:
+ if proc.poll() is None:
+ proc.terminate()
+ try:
+ _, err = proc.communicate(timeout=10)
+ except subprocess.TimeoutExpired:
+ proc.kill()
+ _, err = proc.communicate()
+ if proc.returncode != 0:
+ raise AssertionError(
+ f"elog exited rc={proc.returncode}; "
+ f"stderr={err.decode()}")
+
+
+def decode_capture(path):
+ """Run `elog -e -r <path>` and return its stdout."""
+ return subprocess.check_output(
+ [ELOG, "-e", "-r", str(path)], text=True)
+
+
+def decode_capture_or_empty(path):
+ """Like decode_capture, but if the binary capture file is 0 bytes
+ (no events were ever recorded, so elog never wrote the header)
+ return "". elog(1) rejects an empty file with `bad magic number`,
+ which is correct behaviour for replay but is not a test failure
+ when we deliberately set up a filter that drops every event.
+ """
+ if Path(path).stat().st_size == 0:
+ return ""
+ return decode_capture(path)
+
+
+def _assert_in_decode(decoded, needle, label, *, head_chars=2000):
+ """Assert that `needle` is a substring of `decoded`. On failure include
+ the head of the decoded text so the failure is self-debugging in the
+ kyua report."""
+ if needle in decoded:
+ return
+ pytest.fail(
+ f"{label}: missing literal {needle!r} in decoded capture\n"
+ f"--- decoded.txt (first {head_chars} chars) ---\n"
+ f"{decoded[:head_chars]}\n--- end ---")
+
+
+# ---------------------------------------------------------------------------
+# elog binary-format decoder (in-process, no fork to elog -r)
+# ---------------------------------------------------------------------------
+#
+# On-disk layout (host byte order; we use "<" for amd64):
+#
+# elog_binary_header (40 bytes, __packed):
+# char magic[4] == "ELOG"
+# uint32_t version == 1
+# uint64_t capture_start microseconds since boot at capture start
+# uint64_t start_utc_us wall-clock UTC microseconds at capture start
+# uint64_t event_count total events written
+# uint64_t dropped_events events dropped (kernel buffer overflow)
+#
+# uint32_t provider_count
+# provider_count x eventlog_provider_info (34 bytes, __packed):
+# uint16_t provider_id
+# char name[EVENTLOG_PROVIDER_NAME_MAX = 32]
+#
+# stream of:
+# eventlog_event_header (32 bytes, naturally aligned, no padding):
+# uint16_t event_length total record size including header
+# uint16_t cpu
+# uint16_t provider_id
+# uint16_t RESERVED
+# uint64_t timestamp microseconds since boot
+# uint64_t session_id
+# uint32_t event_id
+# int32_t thread_id
+# payload event_length - 32 bytes
+#
+# References:
+# FreeBSD/usr.bin/elog/elog.c elog_binary_header / write_binary_event
+# FreeBSD/sys/sys/eventlog_subscriber.h
+# eventlog_event_header / eventlog_provider_info
+# FreeBSD/include/eventlog/tcp_eventlog_schema.src
+# STRUCT layouts for payload parsing
+
+ELOG_BINARY_MAGIC = b"ELOG"
+ELOG_BINARY_VERSION = 1
+EVENTLOG_PROVIDER_NAME_MAX = 32
+
+# Framework-reserved event IDs (FreeBSD/sys/sys/eventlog.h).
+EVENTLOG_SESSION_CREATE_ID = 0xFFFFFFFE
+EVENTLOG_SESSION_END_ID = 0xFFFFFFFF
+
+# tcp provider event IDs (tcp_eventlog_schema.src).
+TCP_EVENT_CONN_SET_IP_V4 = 0
+TCP_EVENT_CONN_SET_IP_V6 = 1
+TCP_EVENT_CONN_PARAMS = 3
+TCP_EVENT_MSS = 4
+TCP_EVENT_IN = 100
+TCP_EVENT_OUT = 101
+
+ELOG_BINARY_HEADER_FMT = "<4sIQQQQ"
+ELOG_BINARY_HEADER_SIZE = struct.calcsize(ELOG_BINARY_HEADER_FMT) # 40
+ELOG_PROVIDER_INFO_FMT = f"<H{EVENTLOG_PROVIDER_NAME_MAX}s"
+ELOG_PROVIDER_INFO_SIZE = struct.calcsize(ELOG_PROVIDER_INFO_FMT) # 34
+ELOG_EVENT_HEADER_FMT = "<HHHHQQIi"
+ELOG_EVENT_HEADER_SIZE = struct.calcsize(ELOG_EVENT_HEADER_FMT) # 32
+assert ELOG_BINARY_HEADER_SIZE == 40
+assert ELOG_PROVIDER_INFO_SIZE == 34
+assert ELOG_EVENT_HEADER_SIZE == 32
+
+ElogFileHeader = namedtuple(
+ "ElogFileHeader",
+ "capture_start start_utc_us event_count dropped_events")
+ElogEventHeader = namedtuple(
+ "ElogEventHeader",
+ "event_length cpu provider_id timestamp session_id event_id thread_id")
+
+
+def parse_elog_binary(path):
+ """Parse a complete elog binary capture file. Returns a tuple
+ `(file_header, providers_by_id, events)` where:
+ file_header ElogFileHeader
+ providers_by_id dict[int -> str], e.g. {1: "tcp"}
+ events list of (ElogEventHeader, payload_bytes)
+
+ Raises AssertionError on any structural problem (bad magic,
+ truncated event, payload-size mismatch with the recorded
+ event_length, etc.).
+ """
+ data = Path(path).read_bytes()
+ off = 0
+
+ if len(data) < ELOG_BINARY_HEADER_SIZE:
+ raise AssertionError(
+ f"capture {path!s}: only {len(data)} bytes, "
+ f"need ≥{ELOG_BINARY_HEADER_SIZE} for the file header")
+ (magic, version, capture_start, start_utc_us, event_count,
+ dropped_events) = struct.unpack_from(
+ ELOG_BINARY_HEADER_FMT, data, off)
+ off += ELOG_BINARY_HEADER_SIZE
+ if magic != ELOG_BINARY_MAGIC:
+ raise AssertionError(
+ f"capture {path!s}: bad magic {magic!r} "
+ f"(expected {ELOG_BINARY_MAGIC!r})")
+ if version != ELOG_BINARY_VERSION:
+ raise AssertionError(
+ f"capture {path!s}: bad version {version} "
+ f"(expected {ELOG_BINARY_VERSION})")
+ file_hdr = ElogFileHeader(
+ capture_start=capture_start, start_utc_us=start_utc_us,
+ event_count=event_count, dropped_events=dropped_events)
+
+ if off + 4 > len(data):
+ raise AssertionError(
+ f"capture {path!s}: truncated before provider_count")
+ (prov_count,) = struct.unpack_from("<I", data, off)
+ off += 4
+ # The kernel's EVENTLOG_MAX_PROVIDERS is 32; a value above that
+ # signals a corrupt header.
+ if prov_count > 32:
+ raise AssertionError(
+ f"capture {path!s}: implausible provider_count={prov_count}")
+ providers = {}
+ for i in range(prov_count):
+ if off + ELOG_PROVIDER_INFO_SIZE > len(data):
+ raise AssertionError(
+ f"capture {path!s}: truncated provider entry {i}")
+ prov_id, name_bytes = struct.unpack_from(
+ ELOG_PROVIDER_INFO_FMT, data, off)
+ off += ELOG_PROVIDER_INFO_SIZE
+ name = name_bytes.split(b"\0", 1)[0].decode("ascii")
+ providers[prov_id] = name
+
+ events = []
+ while off < len(data):
+ if off + ELOG_EVENT_HEADER_SIZE > len(data):
+ raise AssertionError(
+ f"capture {path!s}: trailing partial event header "
+ f"at off={off} (remaining={len(data) - off})")
+ (event_length, cpu, provider_id, _reserved, timestamp,
+ session_id, event_id, thread_id) = struct.unpack_from(
+ ELOG_EVENT_HEADER_FMT, data, off)
+ if event_length < ELOG_EVENT_HEADER_SIZE:
+ raise AssertionError(
+ f"capture {path!s}: event_length {event_length} "
+ f"< header size {ELOG_EVENT_HEADER_SIZE} at off={off}")
+ if off + event_length > len(data):
+ raise AssertionError(
+ f"capture {path!s}: event at off={off} runs past EOF "
+ f"(event_length={event_length}, "
+ f"remaining={len(data) - off})")
+ payload = bytes(data[off + ELOG_EVENT_HEADER_SIZE:
+ off + event_length])
+ events.append((
+ ElogEventHeader(
+ event_length=event_length, cpu=cpu,
+ provider_id=provider_id, timestamp=timestamp,
+ session_id=session_id, event_id=event_id,
+ thread_id=thread_id),
+ payload,
+ ))
+ off += event_length
+
+ return file_hdr, providers, events
+
+
+def parse_session_create_payload(payload):
+ """SESSION_CREATE payload (from STRUCT SESSION_CREATE in
+ tcp_eventlog_schema.src):
+ void *tp; // 8 bytes on amd64, naturally aligned
+ """
+ if len(payload) != 8:
+ raise AssertionError(
+ f"SESSION_CREATE payload size {len(payload)} != 8")
+ (tp,) = struct.unpack("<Q", payload)
+ return {"tp": tp}
+
+
+def parse_conn_set_ip_v4_payload(payload):
+ """CONN_SET_IP_V4 payload (from STRUCT CONN_SET_IP_V4 in
+ tcp_eventlog_schema.src), with C natural alignment:
+ struct in_addr src_addr; // 4 bytes (network byte order)
+ uint16_t src_port; // 2 bytes (network byte order)
+ // 2 bytes padding so the next in_addr is 4-byte aligned
+ struct in_addr dst_addr; // 4 bytes
+ uint16_t dst_port; // 2 bytes
+ // 2 bytes trailing padding so sizeof rounds up to 16
+ """
+ if len(payload) != 16:
+ raise AssertionError(
+ f"CONN_SET_IP_V4 payload size {len(payload)} != 16")
+ src_addr_bytes = payload[0:4]
+ (src_port_n,) = struct.unpack_from("<H", payload, 4)
+ dst_addr_bytes = payload[8:12]
+ (dst_port_n,) = struct.unpack_from("<H", payload, 12)
+ return {
+ # in_addr stores the address in network byte order; the raw
+ # 4 bytes are exactly what inet_ntoa expects.
+ "src_addr": socket.inet_ntoa(src_addr_bytes),
+ "dst_addr": socket.inet_ntoa(dst_addr_bytes),
+ "src_port": socket.ntohs(src_port_n),
+ "dst_port": socket.ntohs(dst_port_n),
+ }
+
+
+# ---------------------------------------------------------------------------
+# CLI smoke tests (no kernel touch)
+# ---------------------------------------------------------------------------
class TestElogCli(BaseTest):
+ """Cheap smoke tests for the elog binary CLI surface. These catch
+ packaging regressions (missing binary, broken option parser) without
+ needing /dev/eventlog or the kernel framework to be functional."""
+
@pytest.mark.require_progs(["elog"])
def test_help(self):
# usage() in elog.c calls exit(1), so -h is expected to fail.
@@ -65,3 +468,676 @@
[ELOG, "-r", str(bogus)], capture_output=True, text=True)
assert r.returncode == 1
assert "bad magic number" in r.stderr
+
+
+# ---------------------------------------------------------------------------
+# Tests that touch /dev/eventlog and the tcp eventlog provider
+# ---------------------------------------------------------------------------
+
+
+class TestElogCapture(BaseTest):
+ """End-to-end coverage: elog -> /dev/eventlog -> kernel framework.
+
+ Most of these pin net.inet.tcp.functions_default to "freebsd" for
+ the duration of the test. Eventlog wiring is per-stack
+ (tfb_eventlog_provider on the function-block); pinning to a stack
+ that is known to register the tcp eventlog provider keeps the
+ tests deterministic regardless of which stack the host happens to
+ have configured by default. Cases that need a different stack
+ skip cleanly if that stack is not available on the running image
+ (sysctl_override -> pytest.skip).
+ """
+
+ NEED_ROOT = True
+
+ @pytest.mark.require_user("root")
+ def test_dev_eventlog_present(self):
+ # Sanity: /dev/eventlog should exist as a character device on any
+ # kernel with the eventlog framework compiled in.
+ assert Path(EVENTLOG_DEV).is_char_device(), (
+ f"{EVENTLOG_DEV} not a character device")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(30)
+ def test_capture_tcp_brief(self, tmp_path):
+ """elog -c tcp subscribes, captures briefly with no traffic, and
+ exits cleanly on SIGTERM."""
+ _require_eventlog_dev()
+ capture = tmp_path / "capture.elog"
+ # No traffic; the SIGTERM handler in elog.c sets done=true and
+ # the read loop returns 0 from main().
+ with elog_capture("-c", "tcp", capture_path=capture) as elog:
+ time.sleep(1)
+ # Stats output goes to stderr in a "[Stats]\n Providers: N" block.
+ # A successful subscribe always discovers at least one provider.
+ # (The elog_capture context manager already asserted rc==0.)
+ # We don't validate stats stderr here; the capture context manager
+ # would have raised if elog died.
+ assert elog.returncode == 0
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ @pytest.mark.parametrize(
+ "stack,family",
+ [
+ pytest.param("freebsd", socket.AF_INET, id="freebsd_inet"),
+ pytest.param("freebsd", socket.AF_INET6, id="freebsd_inet6"),
+ ],
+ )
+ def test_capture_tcp_loopback(self, tmp_path, stack, family):
+ """Drive a loopback TCP exchange under the given stack and family,
+ then verify the captured eventlog contains the events we expect."""
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port()
+ payload = b"ping over loopback\n"
+
+ with sysctl_override("net.inet.tcp.functions_default", stack), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ sessions_created_before = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_created"))
+
+ with elog_capture("-c", "tcp", capture_path=capture):
+ # One-shot loopback exchange. Python's socket module
+ # makes the handshake/teardown deterministic - no nc
+ # process to wait for, no sockstat polling.
+ received = loopback_oneshot(family, port, payload)
+ assert payload.rstrip(b"\n") in received
+
+ # Let the kernel push remaining events and elog drain
+ # its read (the read path's tsleep also has a 1Hz timer).
+ time.sleep(2)
+
+ sessions_created_after = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_created"))
+
+ # If no new sessions were created, downstream content assertions
+ # are meaningless.
+ assert sessions_created_after > sessions_created_before, (
+ f"no new tcp sessions during exchange "
+ f"(stack={stack}, family={family.name})")
+
+ # --- Decode and validate ---
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ if family == socket.AF_INET6:
+ ipver_label = "IPv6 connection"
+ loop_addr = "::1"
+ else:
+ ipver_label = "IPv4 connection"
+ loop_addr = "127.0.0.1"
+
+ # Each tuple is (substring, human label). On failure the helper
+ # dumps the head of the decoded capture.
+ expectations = [
+ ("[tcp]", "tcp provider tag"),
+ ("[SESSION_CREATE]", "SESSION_CREATE event"),
+ ("[SESSION_END]", "SESSION_END event"),
+ (ipver_label, f"{ipver_label} literal"),
+ (loop_addr, f"loopback addr {loop_addr}"),
+ (str(port), f"chosen port {port}"),
+ ("Connection parameters: ISS", "CONN_PARAMS event"),
+ ("MSS set to", "MSS event"),
+ ("[IN]", "[IN] data event"),
+ ("[OUT]", "[OUT] data event"),
+ ("flags SYN", "SYN-bearing segment"),
+ ("in state ESTABLISHED", "ESTABLISHED-state IN event"),
+ ]
+ for needle, label in expectations:
+ _assert_in_decode(decoded, needle, label)
+
+ # Binary file format integrity: decoding twice must produce
+ # byte-identical output. Catches non-deterministic decode bugs
+ # (stale reader state, miscomputed offsets, etc.).
+ decoded2 = decode_capture(capture)
+ assert decoded == decoded2, "elog -r is not deterministic"
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_dump_state(self, tmp_path):
+ """Validate the elog -D (DUMP_STATE) flow.
+
+ Open a tcp connection BEFORE subscribing, then run elog with -D.
+ The CREATE IOCTL invokes tcp_eventlog_dump_state inline on this
+ thread; the callback walks every TCP PCB and re-emits
+ SESSION_CREATE / CONN_SET_IP_V4 / CONN_PARAMS for each
+ enabled session. This exercises a meaningfully different code
+ path from test_capture_tcp_loopback (where the subscriber
+ attaches first and events flow live).
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=1)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ # Stand up a long-lived loopback connection FIRST so its
+ # session pre-exists when elog subscribes.
+ with loopback_pair(socket.AF_INET, port):
+ # Subscribe with -D. The dump callback fires inline.
+ with elog_capture("-c", "tcp", "-D",
+ capture_path=capture):
+ # Give elog time to drain the dumped events.
+ time.sleep(2)
+
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ for needle, label in (
+ ("[SESSION_CREATE]", "SESSION_CREATE in dump"),
+ ("IPv4 connection", "CONN_SET_IP_V4 in dump"),
+ ("127.0.0.1", "loopback addr in dump"),
+ (str(port), f"port {port} in dump"),
+ ("Connection parameters: ISS", "CONN_PARAMS in dump"),
+ (", MSS ", "MSS field of CONN_PARAMS in dump"),
+ ):
+ _assert_in_decode(decoded, needle, label)
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_mass_enable_disable(self, tmp_path):
+ """Validate kern.eventlog.tcp.default=2 / -1 transitions.
+
+ Writing 2 means "enable all currently-disabled sessions"; -1
+ means "disable all currently-active sessions". Plain 0/1
+ transitions only affect future sessions and are covered by
+ test_capture_tcp_loopback. The 2/-1 path runs through
+ tcp_eventlog_default_changed and eventlog_provider_set_all_sessions
+ and otherwise has no end-to-end coverage.
+
+ Strategy: open a long-lived nc-equivalent pair under default=0
+ (sessions come up disabled), snapshot sessions_enabled, flip
+ default=2 and assert it rose, drive a side connection so we can
+ confirm events actually flow under default=2, flip default=-1
+ and assert sessions_enabled fell. Counter deltas are robust
+ against background tcp activity on the test image.
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=2)
+ side_port = _pick_port(salt=3)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "0"):
+
+ with elog_capture("-c", "tcp", capture_path=capture):
+ # Long-lived holding pair. Sessions come up disabled
+ # because default=0 at the moment they're created.
+ with loopback_pair(socket.AF_INET, port):
+ enabled_before = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_enabled"))
+
+ # === Phase 1: mass enable ===
+ _sysctl_set("kern.eventlog.tcp.default", "2")
+ enabled_after_enable = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_enabled"))
+ assert enabled_after_enable > enabled_before, (
+ f"default=2 did not raise sessions_enabled: "
+ f"{enabled_before} -> {enabled_after_enable}")
+
+ # Drive a side exchange while logging is on so the
+ # capture has observable events for that connection.
+ loopback_oneshot(socket.AF_INET, side_port,
+ b"mass-enable phase\n")
+ time.sleep(1)
+
+ # === Phase 2: mass disable ===
+ _sysctl_set("kern.eventlog.tcp.default", "-1")
+ enabled_after_disable = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_enabled"))
+ assert enabled_after_disable < enabled_after_enable, (
+ f"default=-1 did not lower sessions_enabled: "
+ f"{enabled_after_enable} -> {enabled_after_disable}")
+
+ # Brief drain before the holding pair tears down.
+ time.sleep(1)
+
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ # The phase-1 side connection was opened *while* default=2 was in
+ # effect, so its sessions started enabled and emitted normal
+ # events. We expect SESSION_CREATE plus IN/OUT for that port.
+ for needle, label in (
+ (str(side_port), f"side connection port {side_port}"),
+ ("[SESSION_CREATE]", "SESSION_CREATE in capture"),
+ ("IPv4 connection", "CONN_SET_IP_V4 in capture"),
+ ("[IN]", "[IN] events in capture"),
+ ("[OUT]", "[OUT] events in capture"),
+ ):
+ _assert_in_decode(decoded, needle, label)
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ @pytest.mark.parametrize(
+ "keyword,expected_present,expected_absent",
+ [
+ pytest.param("RX", "[IN]", "[OUT]", id="RX_only"),
+ pytest.param("TX", "[OUT]", "[IN]", id="TX_only"),
+ ],
+ )
+ def test_capture_tcp_keyword_filter(
+ self, tmp_path, keyword, expected_present, expected_absent):
+ """Validate `-c tcp <level> <keyword>` keyword filtering.
+
+ IN events are tagged with the RX keyword and OUT events with
+ TX (see tcp_eventlog_schema.src). A subscription that
+ specifies only RX must produce IN events but no OUT events,
+ and vice versa. SESSION_CREATE/SESSION_END are tagged with
+ the framework-reserved SESSION keyword which elog(1)
+ unconditionally OR's into every subscription, so those still
+ appear regardless of the user-specified keyword mask.
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=20)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+ with elog_capture("-c", "tcp", "VERBOSE", keyword,
+ capture_path=capture):
+ loopback_oneshot(socket.AF_INET, port,
+ b"keyword-filter\n")
+ time.sleep(2)
+
+ decoded = decode_capture(capture)
+ assert decoded.strip(), "decoded capture is empty"
+
+ _assert_in_decode(
+ decoded, expected_present,
+ f"keyword={keyword}: {expected_present} should be present")
+ if expected_absent in decoded:
+ pytest.fail(
+ f"keyword={keyword}: {expected_absent!r} should be "
+ f"filtered out but is present in capture\n"
+ f"--- decoded.txt (first 2000 chars) ---\n"
+ f"{decoded[:2000]}\n--- end ---")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_level_filter(self, tmp_path):
+ """Validate `-c tcp <level>` level filtering.
+
+ Subscribing at ERROR (level=1) requests only events at level
+ ERROR or below. All TCP events on a clean exchange are at
+ INFO/VERBOSE/TRACE level (see tcp_eventlog_schema.src), so the
+ capture must contain none of the normal lifecycle events.
+ Compared to a TRACE subscription over the same workload,
+ which produces dozens of events.
+
+ Note: when the level filter drops every event the capture
+ file is 0 bytes (elog only writes the binary header lazily,
+ on the first event). decode_capture_or_empty handles that.
+ """
+ _require_eventlog_dev()
+
+ port_low = _pick_port(salt=21)
+ port_high = _pick_port(salt=22)
+
+ capture_low = tmp_path / "capture-error.elog"
+ capture_high = tmp_path / "capture-trace.elog"
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ # Pass A: ERROR level - clean traffic should produce no
+ # tcp lifecycle/data events at this filter level.
+ with elog_capture("-c", "tcp", "ERROR",
+ capture_path=capture_low):
+ loopback_oneshot(socket.AF_INET, port_low,
+ b"level-error\n")
+ time.sleep(2)
+
+ # Pass B: TRACE level - same traffic, full event stream.
+ with elog_capture("-c", "tcp", "TRACE",
+ capture_path=capture_high):
+ loopback_oneshot(socket.AF_INET, port_high,
+ b"level-trace\n")
+ time.sleep(2)
+
+ decoded_low = decode_capture_or_empty(capture_low)
+ decoded_high = decode_capture(capture_high)
+
+ # ERROR pass must not contain any of the normal INFO/VERBOSE
+ # events. We test for [IN] and [OUT] (VERBOSE) plus
+ # CONN_SET_IP_V4's literal output (INFO). All three are absent
+ # at ERROR level. (Empty capture trivially satisfies this.)
+ for needle in ("[IN]", "[OUT]", "IPv4 connection"):
+ if needle in decoded_low:
+ pytest.fail(
+ f"level=ERROR capture should not contain "
+ f"{needle!r} but does\n"
+ f"--- decoded-error.txt (first 2000 chars) ---\n"
+ f"{decoded_low[:2000]}\n--- end ---")
+
+ # TRACE pass must contain the full event stream for the same
+ # workload. This proves the level filter actually filters
+ # (rather than the test's traffic being silently broken).
+ for needle, label in (
+ ("[IN]", "[IN] in TRACE capture"),
+ ("[OUT]", "[OUT] in TRACE capture"),
+ ("IPv4 connection", "CONN_SET_IP_V4 in TRACE capture"),
+ ):
+ _assert_in_decode(decoded_high, needle, label)
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_per_session_output_dir(self, tmp_path):
+ """Validate `-o dir=PATH -f tcp` per-session output mode.
+
+ elog supports splitting captured events into one binary file
+ per session, named by TCP four-tuple when -f tcp is set:
+ <log_id>_<local_port>_<remote_ip>_<remote_port>.elog
+ This is the operational mode used to debug a single
+ connection in the field.
+
+ Drive two distinct loopback connections, then assert:
+ 1) the output dir is created
+ 2) at least one per-session file is created for each
+ connection (matching the chosen port)
+ 3) every per-session file decodes successfully and contains
+ a SESSION_CREATE event for its session
+ """
+ _require_eventlog_dev()
+
+ out_dir = tmp_path / "split"
+ port_a = _pick_port(salt=30)
+ port_b = _pick_port(salt=31)
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+ with elog_capture("-c", "tcp", "-f", "tcp",
+ "-o", f"dir={out_dir}"):
+ # Two distinct connections so we can assert both
+ # ports appear in the per-session filenames.
+ for port in (port_a, port_b):
+ loopback_oneshot(socket.AF_INET, port,
+ b"per-session\n")
+ time.sleep(2)
+
+ assert out_dir.is_dir(), \
+ f"elog did not create output dir {out_dir}"
+
+ files = sorted(out_dir.glob("*.elog"))
+ assert files, f"no per-session files under {out_dir}"
+
+ names = [f.name for f in files]
+ for port in (port_a, port_b):
+ matches = [n for n in names if str(port) in n]
+ assert matches, (
+ f"no per-session file mentions port {port}; "
+ f"got: {names}")
+
+ # Every per-session file must be a valid binary capture and
+ # contain a SESSION_CREATE event for the session it represents.
+ for path in files:
+ decoded = decode_capture(path)
+ _assert_in_decode(
+ decoded, "[SESSION_CREATE]",
+ f"{path.name}: SESSION_CREATE in per-session file")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_sessions_active_no_leak(self):
+ """Open and close N TCP loopback pairs; sessions_active must
+ return to its baseline. Catches PCB-side eventlog session
+ lifecycle leaks (a destroyed PCB whose session record never
+ gets freed would inflate sessions_active monotonically).
+
+ We use SO_LINGER {1, 0} on the client so close() emits a TCP
+ RST and the kernel reclaims both PCBs immediately, avoiding
+ TIME_WAIT and keeping the test runtime bounded.
+ """
+ _require_eventlog_dev()
+
+ N = 10
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+
+ # Settle for a moment then sample baseline. The test image
+ # has sshd, possibly other tcp sessions; we tolerate small
+ # drift.
+ time.sleep(1)
+ baseline = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_active"))
+
+ # Each iteration creates exactly two enabled tcp sessions
+ # (one per PCB) and immediately destroys both via RST.
+ for i in range(N):
+ with loopback_pair(socket.AF_INET,
+ _pick_port(salt=40 + i),
+ linger0=True):
+ pass
+
+ # Give the kernel a moment to reclaim PCBs and drain any
+ # SESSION_END processing.
+ time.sleep(2)
+
+ final = int(_sysctl_get(
+ "kern.eventlog.tcp.sessions_active"))
+
+ delta = final - baseline
+ # A real leak shows up as delta >= 2*N (one session per PCB,
+ # two PCBs per pair). Tolerate small ambient growth (e.g.,
+ # the test runner's own ssh shell churning) up to N - this
+ # is well below the 2*N a leak would produce.
+ assert delta < N, (
+ f"tcp eventlog sessions_active leak: "
+ f"baseline={baseline} -> final={final} "
+ f"(delta={delta}, opened={N} pairs)")
+
+ @pytest.mark.require_user("root")
+ @pytest.mark.require_progs(["elog"])
+ @pytest.mark.timeout(60)
+ def test_capture_tcp_binary_format(self, tmp_path):
+ """Validate the on-disk binary capture format end-to-end by
+ decoding it directly in Python (no `elog -r` round-trip).
+
+ `elog -r` is forgiving about its own output: it could trade
+ layouts back and forth without anyone noticing. Real
+ consumers of the binary stream - external collectors, custom
+ analyzers, off-host tooling - aren't, and any change to the
+ record layout, header struct, or framework-emitted events is
+ an ABI change for them. Parsing the file with a hand-written
+ Python decoder catches those changes immediately.
+
+ Asserts on the binary file directly:
+ - elog_binary_header magic / version / start_utc_us / counts
+ are sane and self-consistent (event_count matches the
+ number of records actually walked, dropped_events=0).
+ - Provider map contains the tcp provider id.
+ - Each event header has provider_id matching the tcp
+ provider, event_length is plausible (>= header,
+ <= a generous max), and the recorded payload size
+ agrees with event_length - sizeof(header).
+ - Per-session timestamps are monotonically non-decreasing
+ within the same session_id.
+ - Loopback creates two PCBs (client + server) so we expect
+ >= 2 distinct session_ids that each transit
+ SESSION_CREATE -> ... -> SESSION_END, with at least one
+ CONN_SET_IP_V4, MSS, and one of IN/OUT in between.
+ - SESSION_CREATE payload is exactly 8 bytes (void *tp)
+ and tp is non-zero (real PCB pointer from the kernel).
+ - CONN_SET_IP_V4 payload is exactly 16 bytes and decodes
+ to (127.0.0.1, port, 127.0.0.1, port) for the loopback
+ four-tuple, with our chosen port present in at least
+ one of the captured CONN_SET_IP_V4 events.
+ """
+ _require_eventlog_dev()
+
+ capture = tmp_path / "capture.elog"
+ port = _pick_port(salt=50)
+ payload = b"binary-format\n"
+
+ with sysctl_override("net.inet.tcp.functions_default", "freebsd"), \
+ sysctl_override("kern.eventlog.tcp.default", "1"):
+ with elog_capture("-c", "tcp", capture_path=capture):
+ received = loopback_oneshot(socket.AF_INET, port, payload)
+ assert payload.rstrip(b"\n") in received
+ # Wait long enough for the kernel to push everything
+ # and elog to drain its read; otherwise event_count
+ # in the file header may be < the events we expect.
+ time.sleep(2)
+
+ file_hdr, providers, events = parse_elog_binary(capture)
+
+ # File-header invariants. event_count is updated in-place at
+ # close (update_binary_header in elog.c), so it must equal
+ # what we walked.
+ assert file_hdr.event_count == len(events), (
+ f"event_count in header ({file_hdr.event_count}) != "
+ f"events walked ({len(events)})")
+ assert file_hdr.dropped_events == 0, (
+ f"kernel reported {file_hdr.dropped_events} dropped events "
+ f"during the capture")
+ assert file_hdr.capture_start > 0, (
+ "capture_start should be the boot-relative timestamp at "
+ "capture start; got 0")
+ assert file_hdr.start_utc_us > 0, (
+ "start_utc_us should be the wall-clock UTC microseconds "
+ "at capture start; got 0")
+
+ # Provider map invariants: at least one entry named "tcp".
+ # Each TCP function block (freebsd, rack, ngen_*) registers its
+ # own eventlog_provider and the kernel's name-based subscribe
+ # path matches every provider whose name is "tcp", so a single
+ # `-c tcp` subscription can pull in multiple provider ids on
+ # an image that builds more than one stack. The invariant is
+ # that *every* event in the capture came from one of those tcp
+ # providers, not that there is exactly one of them.
+ tcp_ids = {pid for pid, name in providers.items()
+ if name == "tcp"}
+ assert tcp_ids, (
+ f"no tcp provider in capture; got {providers!r}")
+
+ # Per-event invariants and per-session bookkeeping. We check
+ # invariants on EVERY event in the file but only build the
+ # detailed per-session event-id map: VMs always have some
+ # background TCP activity, and per-session content assertions
+ # only make sense for the sessions we created ourselves
+ # (identified below by matching the CONN_SET_IP_V4 four-tuple
+ # against `port`).
+ assert events, "no events recorded in capture"
+ per_session_last_ts = {}
+ per_session_event_ids = {}
+ for hdr, ev_payload in events:
+ assert hdr.provider_id in tcp_ids, (
+ f"unexpected provider_id={hdr.provider_id} in event "
+ f"(tcp provider ids: {sorted(tcp_ids)}; "
+ f"providers: {providers!r})")
+ # event_length self-consistency.
+ assert (hdr.event_length ==
+ ELOG_EVENT_HEADER_SIZE + len(ev_payload)), (
+ f"event_length {hdr.event_length} != header "
+ f"({ELOG_EVENT_HEADER_SIZE}) + payload "
+ f"({len(ev_payload)})")
+ # Sanity: no individual event should be huge. The kernel
+ # caps payloads at a few hundred bytes; 4 KiB is a
+ # generous upper bound that still flags corruption.
+ assert hdr.event_length <= 4096, (
+ f"event_length {hdr.event_length} suspiciously large")
+ # Timestamp monotonicity within a session_id. Different
+ # sessions can interleave (different CPUs), so we don't
+ # check globally - only within a session.
+ last = per_session_last_ts.get(hdr.session_id)
+ if last is not None:
+ assert hdr.timestamp >= last, (
+ f"timestamp regression in session "
+ f"{hdr.session_id}: {last} -> {hdr.timestamp}")
+ per_session_last_ts[hdr.session_id] = hdr.timestamp
+ per_session_event_ids.setdefault(
+ hdr.session_id, []).append(hdr.event_id)
+
+ # Identify *our* loopback sessions: the ones whose
+ # CONN_SET_IP_V4 four-tuple touches `port`. The VM has
+ # incidental TCP activity (e.g. a kernel-internal session
+ # that may transit only SESSION_CREATE/SESSION_END), so we
+ # can't assume that every session in the capture belongs to
+ # this test.
+ v4_events = [
+ (h, p) for h, p in events
+ if h.event_id == TCP_EVENT_CONN_SET_IP_V4]
+ assert v4_events, "no CONN_SET_IP_V4 events to validate"
+
+ our_sessions = set()
+ seen_ports = set()
+ for hdr, ev_payload in v4_events:
+ info = parse_conn_set_ip_v4_payload(ev_payload)
+ seen_ports.add(info["src_port"])
+ seen_ports.add(info["dst_port"])
+ if info["src_port"] == port or info["dst_port"] == port:
+ # Validate the CONN_SET_IP_V4 payload structure for
+ # connections we know are ours: addresses must be
+ # loopback. (We don't enforce 127.0.0.1 on every
+ # CONN_SET_IP_V4 in the capture; an unrelated kernel
+ # session could legitimately have a different IP.)
+ assert info["src_addr"] == "127.0.0.1", (
+ f"CONN_SET_IP_V4 src {info['src_addr']!r} != "
+ f"127.0.0.1 in our session {hdr.session_id}")
+ assert info["dst_addr"] == "127.0.0.1", (
+ f"CONN_SET_IP_V4 dst {info['dst_addr']!r} != "
+ f"127.0.0.1 in our session {hdr.session_id}")
+ our_sessions.add(hdr.session_id)
+ assert port in seen_ports, (
+ f"chosen port {port} not present in any CONN_SET_IP_V4 "
+ f"four-tuple (saw ports: {sorted(seen_ports)})")
+ # Loopback creates two PCBs (client + server), each with its
+ # own eventlog session, so we expect exactly 2 of "ours". We
+ # accept >= 2 to remain robust against unforeseen retries
+ # while still catching a "we lost a side" regression.
+ assert len(our_sessions) >= 2, (
+ f"expected ≥2 loopback sessions touching port {port}, "
+ f"got {len(our_sessions)}: {sorted(our_sessions)}")
+
+ # For each session WE created, validate the full lifecycle
+ # is present in the binary record stream.
+ for sid in sorted(our_sessions):
+ ids = per_session_event_ids[sid]
+ assert EVENTLOG_SESSION_CREATE_ID in ids, (
+ f"session {sid}: no SESSION_CREATE event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert EVENTLOG_SESSION_END_ID in ids, (
+ f"session {sid}: no SESSION_END event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert TCP_EVENT_CONN_SET_IP_V4 in ids, (
+ f"session {sid}: no CONN_SET_IP_V4 event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert TCP_EVENT_MSS in ids, (
+ f"session {sid}: no MSS event "
+ f"(event_ids: {sorted(set(ids))})")
+ assert (TCP_EVENT_IN in ids) or (TCP_EVENT_OUT in ids), (
+ f"session {sid}: no IN or OUT data event "
+ f"(event_ids: {sorted(set(ids))})")
+
+ # Validate the SESSION_CREATE payload structure for the
+ # SESSION_CREATE events of our loopback sessions: 8-byte tp
+ # pointer that the kernel passes from
+ # tcp_eventlog_session_init must be non-zero.
+ our_sc_events = [
+ (h, p) for h, p in events
+ if (h.event_id == EVENTLOG_SESSION_CREATE_ID
+ and h.session_id in our_sessions)]
+ assert len(our_sc_events) >= 2, (
+ f"expected ≥2 SESSION_CREATE events for our loopback "
+ f"sessions, got {len(our_sc_events)}")
+ for hdr, ev_payload in our_sc_events:
+ sc = parse_session_create_payload(ev_payload)
+ assert sc["tp"] != 0, (
+ f"SESSION_CREATE for session {hdr.session_id}: "
+ f"tp pointer is 0 (kernel did not record tcpcb)")
diff --git a/usr.bin/elog/elog.1 b/usr.bin/elog/elog.1
--- a/usr.bin/elog/elog.1
+++ b/usr.bin/elog/elog.1
@@ -65,7 +65,7 @@
Subscribe to events from the specified provider.
.Bl -tag -width indent
.It Ar provider
-The name of the eventlog provider.
+The name of the eventlog provider (e.g., "tcp").
This argument is required.
.It Ar level
Optional log level.
@@ -106,6 +106,18 @@
.It Fl -event-name
Show the event name (e.g., IN, OUT, SESSION_CREATE) in square brackets
after the session ID in each output line.
+.It Fl f Ar type
+.It Fl -format Ar type
+Rename per-session output files with connection metadata.
+The only supported
+.Ar type
+is
+.Cm tcp ,
+which renames files with TCP connection metadata.
+Files are named using the format
+.Ar log_id Ns _ Ns Ar local_port Ns _ Ns Ar remote_ip Ns _ Ns Ar remote_port Ns .elog .
+Requires
+.Fl o Cm dir= Ns Ar path .
.It Fl n
.It Fl -event-number
Print a serial event number at the beginning of each output line.
@@ -121,7 +133,9 @@
.It Fl o Cm dir= Ns Ar path
Write one binary file per session under
.Ar path .
-Each session gets its own file, named by session ID.
+Each session gets its own file, named by session ID (or by TCP metadata
+when combined with
+.Fl f Cm tcp ) .
The directory is created if it does not exist.
.It Fl p
.It Fl -providers
@@ -261,49 +275,49 @@
.Sh EXIT STATUS
.Ex -std
.Sh EXAMPLES
-Subscribe to all events from a provider at VERBOSE level with all keywords:
+Subscribe to all TCP events at VERBOSE level with all keywords:
.Bd -literal -offset indent
-elog -c provider
+elog -c tcp
.Ed
.Pp
-Subscribe to a provider at INFO level with all keywords:
+Subscribe to TCP events at INFO level with all keywords:
.Bd -literal -offset indent
-elog -c provider INFO
+elog -c tcp INFO
.Ed
.Pp
-Subscribe to a provider at INFO level with specific keywords by name:
+Subscribe to TCP events at INFO level with specific keywords by name:
.Bd -literal -offset indent
-elog -c provider INFO KEYWORD1|KEYWORD2
+elog -c tcp INFO CC|RX|TX|PACING|CONN
.Ed
.Pp
-Subscribe to a provider at INFO level with keywords as hex mask:
+Subscribe to TCP events at INFO level with keywords as hex mask:
.Bd -literal -offset indent
-elog -c provider INFO 0x3F
+elog -c tcp INFO 0x3F
.Ed
.Pp
-Subscribe to a provider at TRACE level:
+Subscribe to all TCP events including TRACE-level detail:
.Bd -literal -offset indent
-elog -c provider TRACE
+elog -c tcp TRACE
.Ed
.Pp
Subscribe to multiple providers:
.Bd -literal -offset indent
-elog -c provider1 INFO -c provider2 WARN
+elog -c tcp INFO -c provider WARN
.Ed
.Pp
Subscribe using numeric levels:
.Bd -literal -offset indent
-elog -c provider 3 0x3F
+elog -c tcp 3 0x3F
.Ed
.Pp
Set buffer size to 4MB per CPU:
.Bd -literal -offset indent
-elog -b 4M -c provider
+elog -b 4M -c tcp
.Ed
.Pp
Write output to a file (binary format):
.Bd -literal -offset indent
-elog -c provider -o /tmp/events.bin
+elog -c tcp -o /tmp/events.bin
.Ed
.Pp
Read binary file and convert to text:
@@ -313,27 +327,27 @@
.Pp
Show relative timestamps from trace start:
.Bd -literal -offset indent
-elog -t -c provider
+elog -t -c tcp
.Ed
.Pp
Show both relative and inter-event delta timestamps:
.Bd -literal -offset indent
-elog -t --delta-time -c provider
+elog -t --delta-time -c tcp
.Ed
.Pp
-Write per-session files into a directory:
+Write per-session files with TCP metadata naming:
.Bd -literal -offset indent
-elog -o dir=/tmp/traces -c provider
+elog -o dir=/tmp/traces -f tcp -c tcp
.Ed
.Pp
Print statistics on exit:
.Bd -literal -offset indent
-elog -s -c provider
+elog -s -c tcp
.Ed
.Pp
Run a fixed-duration capture, suitable for scripted use:
.Bd -literal -offset indent
-elog --duration 30 -c provider1 -c provider2 -o /tmp/cap.elog
+elog --duration 30 -c tcp -o /tmp/cap.elog
.Ed
.Sh SEE ALSO
.Xr elog 5 ,
diff --git a/usr.bin/elog/elog.c b/usr.bin/elog/elog.c
--- a/usr.bin/elog/elog.c
+++ b/usr.bin/elog/elog.c
@@ -26,6 +26,9 @@
#include <sys/time.h>
#include <time.h>
#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
#include <zlib.h>
/* Include consumer header for formatting events */
@@ -65,6 +68,8 @@
static uint64_t prev_event_ts = 0;
static bool dump_state = false; /* Replay current state on subscribe */
static char *output_dir = NULL; /* If set, one file per session */
+/* Rename per-session files using TCP connection metadata */
+static bool format_tcp = false;
/* Per-session file state for -o dir= mode */
struct session_file {
@@ -76,6 +81,13 @@
uint64_t capture_start;
uint64_t start_utc_us;
uint64_t event_count;
+ /* TCP connection metadata for file rename (-f tcp). */
+ char log_id[64];
+ char remote_ip[INET6_ADDRSTRLEN];
+ uint16_t local_port;
+ uint16_t remote_port;
+ bool has_log_id;
+ bool has_conn_info;
};
static STAILQ_HEAD(, session_file) session_files =
STAILQ_HEAD_INITIALIZER(session_files);
@@ -150,6 +162,8 @@
" -s, --stats Print detailed statistics on exit\n"
" -o, --output <file> Write binary output to file (default: stdout)\n"
" -o dir=<path> Write one binary file per session under <path>\n"
+" -f, --format <type> Rename per-session files using connection metadata\n"
+" Supported types: tcp\n"
" -r, --read-binary <file>\n"
" Read binary file and convert to text (.gz ok)\n"
" -t, --relative-time Show time relative to first event\n"
@@ -376,9 +390,9 @@
if (j == 0)
snprintf(sanitized, sizeof(sanitized), "global");
- sf = malloc(sizeof(*sf));
+ sf = calloc(1, sizeof(*sf));
if (sf == NULL)
- err(1, "malloc(session_file)");
+ err(1, "calloc(session_file)");
sf->session_id = strdup(session_id);
if (sf->session_id == NULL)
err(1, "strdup");
@@ -391,9 +405,6 @@
sf->fp = fopen(fullpath, "wb");
if (sf->fp == NULL)
err(1, "fopen(%s)", fullpath);
- sf->header_written = false;
- sf->capture_start = 0;
- sf->event_count = 0;
STAILQ_INSERT_TAIL(&session_files, sf, link);
return (sf->fp);
}
@@ -438,6 +449,107 @@
err(1, "fwrite(binary header)");
}
+/*
+ * Extract TCP connection metadata from CONN_SET_IP_V[46] / LOG_ID
+ * events for the file-rename feature (-f tcp). Only the first log_id
+ * and first connection address seen for a session are captured.
+ */
+static void
+extract_tcp_metadata(struct session_file *sf,
+ const struct eventlog_event_header *hdr,
+ const void *payload, size_t payload_size)
+{
+ if (sf == NULL || (sf->has_log_id && sf->has_conn_info) ||
+ strcmp(get_provider_name(hdr->provider_id), "tcp") != 0)
+ return;
+
+ switch (hdr->event_id) {
+ case TCP_EVENTLOG_CONN_SET_IP_V4_ID: {
+ const struct tcp_eventlog_conn_set_ip_v4 *evt;
+ if (sf->has_conn_info)
+ break;
+ if (payload_size < sizeof(*evt))
+ break;
+ evt = (const struct tcp_eventlog_conn_set_ip_v4 *)payload;
+ sf->local_port = ntohs(evt->src_port);
+ sf->remote_port = ntohs(evt->dst_port);
+ inet_ntop(AF_INET, &evt->dst_addr, sf->remote_ip,
+ sizeof(sf->remote_ip));
+ sf->has_conn_info = true;
+ break;
+ }
+ case TCP_EVENTLOG_CONN_SET_IP_V6_ID: {
+ const struct tcp_eventlog_conn_set_ip_v6 *evt;
+ if (sf->has_conn_info)
+ break;
+ if (payload_size < sizeof(*evt))
+ break;
+ evt = (const struct tcp_eventlog_conn_set_ip_v6 *)payload;
+ sf->local_port = ntohs(evt->src_port);
+ sf->remote_port = ntohs(evt->dst_port);
+ inet_ntop(AF_INET6, &evt->dst_addr, sf->remote_ip,
+ sizeof(sf->remote_ip));
+ sf->has_conn_info = true;
+ break;
+ }
+ case TCP_EVENTLOG_LOG_ID_ID: {
+ const struct tcp_eventlog_log_id *evt;
+ if (sf->has_log_id)
+ break;
+ if (payload_size < sizeof(*evt))
+ break;
+ evt = (const struct tcp_eventlog_log_id *)payload;
+ if (evt->log_id[0] != '\0') {
+ strlcpy(sf->log_id, evt->log_id, sizeof(sf->log_id));
+ sf->has_log_id = true;
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * Rename a session file based on captured TCP metadata.
+ * Format: <log_id>_<local_port>_<remote_ip>_<remote_port>.elog
+ * Missing fields are replaced with "unknown". If no metadata at all,
+ * skip the rename.
+ */
+static void
+rename_session_file(struct session_file *sf)
+{
+ char newpath[PATH_MAX];
+ const char *log_id_str, *remote_ip_str;
+ char local_port_str[8], remote_port_str[8];
+
+ if (!format_tcp || sf == NULL || sf->filepath == NULL ||
+ output_dir == NULL)
+ return;
+ if (!sf->has_log_id && !sf->has_conn_info)
+ return;
+
+ log_id_str = sf->has_log_id ? sf->log_id : "unknown";
+ remote_ip_str = sf->has_conn_info ? sf->remote_ip : "unknown";
+ if (sf->has_conn_info) {
+ snprintf(local_port_str, sizeof(local_port_str), "%u",
+ sf->local_port);
+ snprintf(remote_port_str, sizeof(remote_port_str), "%u",
+ sf->remote_port);
+ } else {
+ strlcpy(local_port_str, "unknown", sizeof(local_port_str));
+ strlcpy(remote_port_str, "unknown", sizeof(remote_port_str));
+ }
+
+ if (snprintf(newpath, sizeof(newpath), "%s/%s_%s_%s_%s.elog",
+ output_dir, log_id_str, local_port_str, remote_ip_str,
+ remote_port_str) >= (int)sizeof(newpath))
+ return;
+
+ if (rename(sf->filepath, newpath) == 0) {
+ free(sf->filepath);
+ sf->filepath = strdup(newpath);
+ }
+}
+
static void
close_session_file(const char *session_id)
{
@@ -451,6 +563,7 @@
fflush(sf->fp);
fclose(sf->fp);
sf->fp = NULL;
+ rename_session_file(sf);
STAILQ_REMOVE(&session_files, sf, session_file, link);
free(sf->filepath);
free(sf->session_id);
@@ -491,7 +604,8 @@
struct session_file *sf;
size_t event_length;
- snprintf(session_id_str, sizeof(session_id_str), "%lu",
+ snprintf(session_id_str, sizeof(session_id_str), "%s_%lu",
+ get_provider_name(hdr->provider_id),
(unsigned long)hdr->session_id);
if (output_dir != NULL) {
@@ -515,6 +629,8 @@
sf->start_utc_us = utc_us;
}
sf->event_count++;
+ if (format_tcp)
+ extract_tcp_metadata(sf, hdr, payload, payload_size);
}
event_length = sizeof(struct eventlog_event_header) + payload_size;
@@ -1013,6 +1129,7 @@
{ "event-number", no_argument, NULL, 'n' },
{ "providers", no_argument, NULL, 'p' },
{ "stats", no_argument, NULL, 's' },
+ { "format", required_argument, NULL, 'f' },
{ "output", required_argument, NULL, 'o' },
{ "read-binary", required_argument, NULL, 'r' },
{ "relative-time", no_argument, NULL, 't' },
@@ -1028,7 +1145,7 @@
* [keywords]" can consume the operands that follow it; getopt_long
* still allows clustered flags such as "-enr file.elog".
*/
- while ((ch = getopt_long(argc, argv, "+b:c:dDehno:pr:st", longopts,
+ while ((ch = getopt_long(argc, argv, "+b:c:dDef:hno:pr:st", longopts,
NULL)) != -1) {
switch (ch) {
case 'c':
@@ -1090,6 +1207,11 @@
case 'D':
dump_state = true;
break;
+ case 'f':
+ if (strcmp(optarg, "tcp") != 0)
+ errx(1, "--format requires 'tcp' argument");
+ format_tcp = true;
+ break;
case 'h':
usage();
break;
@@ -1231,6 +1353,7 @@
fflush(sf->fp);
fclose(sf->fp);
sf->fp = NULL;
+ rename_session_file(sf);
}
free(sf->filepath);
free(sf->session_id);

File Metadata

Mime Type
text/plain
Expires
Sun, Jun 21, 2:32 AM (6 h, 54 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34136693
Default Alt Text
D57309.id179809.diff (118 KB)

Event Timeline