Page MenuHomeFreeBSD

D56979.diff
No OneTemporary

D56979.diff

This file is larger than 256 KB, so syntax highlighting was skipped.
diff --git a/include/eventlog/eventlog_gen.awk b/include/eventlog/eventlog_gen.awk
new file mode 100644
--- /dev/null
+++ b/include/eventlog/eventlog_gen.awk
@@ -0,0 +1,1792 @@
+#!/usr/bin/awk -f
+
+#
+# Copyright (c) 2026 Netflix, Inc.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# Script to generate event log header file from schema file.
+#
+# usage: eventlog_gen.awk <schema.src> -h (generate producer header for kernel)
+# eventlog_gen.awk <schema.src> -c (generate consumer header for userland)
+
+# Print the command-line synopsis and terminate with exit status 1.
+# The text is written to stderr, the same stream die() and warn() use, so
+# that a usage error is never mistaken for regular output.
+function usage()
+{
+	print "usage: eventlog_gen.awk <schema.src> -h|-c" > "/dev/stderr";
+	print " -h Generate producer header (for kernel modules generating events)" > "/dev/stderr";
+	print " -c Generate consumer header (for userland tools consuming events)" > "/dev/stderr";
+	exit 1;
+}
+
+# Report a fatal error on stderr as "<srcfile>(<record number>): <message>"
+# and terminate with exit status 1.
+#
+#   msg   message text; used as a printf format only when 'what' is given
+#   what  optional single argument substituted into msg
+#
+# When 'what' is omitted the message is printed verbatim.  Callers build msg
+# by concatenating text taken from the schema, so a '%' in that text must
+# not be interpreted as a conversion specification.
+function die(msg, what)
+{
+	if (what != "")
+		msg = sprintf(msg, what);
+	# FNR is the built-in record number of the current input file; the
+	# file name and the message are passed as data, never as the format.
+	printf "%s(%d): %s\n", srcfile, FNR, msg > "/dev/stderr";
+	exit 1;
+}
+
+# Append one line of text to the generated header file (global hfile).
+#
+#   s  the line to write
+#
+# On the first call with a non-empty hfile, the directory that will contain
+# hfile is created with "mkdir -p"; the global dir_created records that this
+# was done.  All parameters after 's' are local variables, so calling
+# printh() from inside a loop cannot disturb the caller's loop index.
+function printh(s, dir_path, cwd, parts, nparts, k, segs, nsegs, quoted, cmd)
+{
+	if (hfile != "" && !dir_created) {
+		if (hfile ~ /^\//) {
+			# Absolute path: drop the final component to get the directory.
+			dir_path = hfile;
+			sub(/\/[^\/]*$/, "", dir_path);
+		} else {
+			# Relative path: resolve it against the current directory so
+			# that mkdir always receives an absolute path.
+			"pwd" | getline cwd;
+			close("pwd");
+			nparts = split(hfile, parts, "/");
+			dir_path = cwd;
+			# The last component is the file name, so stop before it.
+			for (k = 1; k < nparts; k++) {
+				if (parts[k] == "" || parts[k] == ".")
+					continue;
+				if (parts[k] == "..")
+					sub(/\/[^\/]*$/, "", dir_path);
+				else
+					dir_path = dir_path "/" parts[k];
+			}
+		}
+		if (dir_path != "") {
+			# Single-quote the path for the shell.  Inside single quotes
+			# nothing is expanded; an embedded quote is written as '\''.
+			nsegs = split(dir_path, segs, "'");
+			quoted = segs[1];
+			for (k = 2; k <= nsegs; k++)
+				quoted = quoted "'\\''" segs[k];
+			# Best effort: a failure here surfaces when the file is opened.
+			cmd = "mkdir -p '" quoted "' 2>/dev/null || true";
+			system(cmd);
+		}
+		dir_created = 1;
+	}
+	print s > hfile;
+}
+
+# Initialise parser state and process the command line:
+#   ARGV[1]   schema source file (left in ARGV so awk reads it)
+#   ARGV[2..] exactly one of -h (producer header) or -c (consumer header)
+BEGIN {
+	# Number of entries parsed so far for each schema table.
+	nevents = 0;
+	nkeywords = 0;
+	nstructs = 0;
+	nenums = 0;
+	nflags = 0;
+	provider = "";
+	hfile = "";
+	opt_h = 0;
+	opt_c = 0;
+	mode = ""; # "producer" or "consumer"
+
+	# Multi-line definition state: one flag and one accumulated text per
+	# kind of definition that may span continuation lines.
+	collecting_struct = 0;
+	collecting_enum = 0;
+	collecting_flag = 0;
+	collecting_event = 0;
+	struct_line = "";
+	enum_line = "";
+	flag_line = "";
+	event_line = "";
+	dir_created = 0;
+
+	if (ARGC < 2)
+		usage();
+
+	srcfile = ARGV[1];
+	# ARGV[1] stays in place: awk needs it to read the schema file.
+
+	for (i = 2; i < ARGC; i++) {
+		if (ARGV[i] == "-h") {
+			opt_h = 1;
+			mode = "producer";
+		} else if (ARGV[i] == "-c") {
+			opt_c = 1;
+			mode = "consumer";
+		} else {
+			usage();
+		}
+		# Blank the option so awk does not try to open it as an input file.
+		ARGV[i] = "";
+	}
+
+	# Exactly one mode must be specified.
+	if (!opt_h && !opt_c)
+		usage();
+	if (opt_h && opt_c)
+		usage();
+
+	# hfile stays empty here: the output name depends on the provider name,
+	# so it is assigned once a PROVIDER line has been parsed.
+}
+
+# PROVIDER <name>
+# Records the provider name (only one is allowed) and derives the output
+# file name from it: <name, lowercased>_eventlog.h for the producer header,
+# <name, lowercased>_eventlog_consumer.h for the consumer header, placed
+# under outdir when that variable is non-empty.
+/^[ \t]*PROVIDER/ {
+	# Strip the indent and squeeze runs of blanks so fields split cleanly.
+	sub(/^[ \t]+/, "");
+	gsub(/[ \t]+/, " ");
+
+	if (NF < 2)
+		die("Invalid PROVIDER line: expected PROVIDER <name>");
+	if (provider != "")
+		die("PROVIDER already defined");
+
+	provider = $2;
+	provider_lower = tolower(provider);
+
+	hfile = provider_lower (mode == "consumer" ? "_eventlog_consumer.h" : "_eventlog.h");
+	if (outdir != "")
+		hfile = outdir "/" hfile;
+
+	next;
+}
+
+# Runs for every record while a STRUCT definition is being accumulated.
+collecting_struct == 1 {
+	# An indented record is a continuation: join it onto the definition.
+	if ($0 ~ /^[ \t]/) {
+		sub(/^[ \t]+/, "");
+		struct_line = struct_line " " $0;
+		next;
+	}
+
+	# Any other record ends the definition.  Parse what was gathered and
+	# deliberately fall through (no "next") so that the rules below still
+	# get to examine the current record.
+	collecting_struct = 0;
+	finalize_struct(struct_line);
+	struct_line = "";
+}
+
+# STRUCT <name> [<field> ...]
+# Starts accumulating a STRUCT definition; indented records that follow are
+# joined onto it by the "collecting_struct == 1" rule.
+#
+# No pending STRUCT can exist at this point: the "collecting_struct == 1"
+# rule precedes this one and, for every record, either consumes it with
+# "next" or finalizes the pending definition and clears the flag.
+/^[ \t]*STRUCT/ {
+	collecting_struct = 1;
+	struct_line = $0;
+	sub(/^[ \t]+/, "", struct_line);
+	next;
+}
+
+# KEYWORD <name> <value>
+# Appends one entry to the keywords[] table; the value is stored exactly
+# as written in the schema.
+/^[ \t]*KEYWORD/ {
+	# Strip the indent and squeeze runs of blanks so fields split cleanly.
+	sub(/^[ \t]+/, "");
+	gsub(/[ \t]+/, " ");
+
+	if (NF < 3)
+		die("Invalid KEYWORD line: expected KEYWORD <name> <value>");
+
+	keywords[++nkeywords, "name"] = $2;
+	keywords[nkeywords, "value"] = $3;
+
+	next;
+}
+
+# Runs for every record while an ENUM definition is being accumulated.
+collecting_enum == 1 {
+	# An indented record is a continuation: join it onto the definition.
+	if ($0 ~ /^[ \t]/) {
+		sub(/^[ \t]+/, "");
+		enum_line = enum_line " " $0;
+		next;
+	}
+
+	# Any other record ends the definition: parse
+	# "ENUM <name> [<value>:<name> ...]" into the enums[] table.
+	collecting_enum = 0;
+	line = enum_line;
+	enum_line = "";
+
+	gsub(/[ \t]+/, " ", line);
+	split(line, fields, " ");
+
+	if (length(fields) < 2)
+		die("Invalid ENUM line: expected ENUM <name> [<value1>:<name1> ...]");
+
+	nenums++;
+	enums[nenums, "name"] = fields[2];
+
+	value_count = 0;
+	for (i = 3; i <= length(fields); i++) {
+		if (fields[i] == "")
+			continue;
+		# Each remaining word must be exactly "<value>:<name>".
+		if (split(fields[i], parts, ":") != 2)
+			die("Invalid enum value definition: " fields[i] " (expected value:name)");
+		value_count++;
+		enums[nenums, "value", value_count, "num"] = parts[1];
+		enums[nenums, "value", value_count, "name"] = parts[2];
+	}
+	enums[nenums, "value_count"] = value_count;
+
+	# No "next": the rules below still need to examine the current record.
+}
+
+# ENUM <name> [<value>:<name> ...]
+# Starts accumulating an ENUM definition; indented records that follow are
+# joined onto it by the "collecting_enum == 1" rule.
+#
+# No pending ENUM can exist at this point: the "collecting_enum == 1" rule
+# precedes this one and, for every record, either consumes it with "next"
+# or parses the pending definition and clears the flag.
+/^[ \t]*ENUM/ {
+	collecting_enum = 1;
+	enum_line = $0;
+	sub(/^[ \t]+/, "", enum_line);
+	next;
+}
+
+# Runs for every record while a FLAG definition is being accumulated.
+collecting_flag == 1 {
+	# An indented record is a continuation: join it onto the definition.
+	if ($0 ~ /^[ \t]/) {
+		sub(/^[ \t]+/, "");
+		flag_line = flag_line " " $0;
+		next;
+	}
+
+	# Any other record ends the definition: parse
+	# "FLAG <name> [<value>:<name> ...]" into the flags[] table.
+	collecting_flag = 0;
+	line = flag_line;
+	flag_line = "";
+
+	gsub(/[ \t]+/, " ", line);
+	split(line, fields, " ");
+
+	if (length(fields) < 2)
+		die("Invalid FLAG line: expected FLAG <name> [<value1>:<name1> ...]");
+
+	nflags++;
+	flags[nflags, "name"] = fields[2];
+
+	value_count = 0;
+	for (i = 3; i <= length(fields); i++) {
+		if (fields[i] == "")
+			continue;
+		# Each remaining word must be exactly "<value>:<name>".
+		if (split(fields[i], parts, ":") != 2)
+			die("Invalid flag value definition: " fields[i] " (expected value:name)");
+		value_count++;
+		flags[nflags, "value", value_count, "num"] = parts[1];
+		flags[nflags, "value", value_count, "name"] = parts[2];
+	}
+	flags[nflags, "value_count"] = value_count;
+
+	# No "next": the rules below still need to examine the current record.
+}
+
+# FLAG <name> [<value>:<name> ...]
+# Starts accumulating a FLAG definition; indented records that follow are
+# joined onto it by the "collecting_flag == 1" rule.
+#
+# Neither a pending ENUM nor a pending FLAG can exist at this point: the
+# "collecting_enum == 1" and "collecting_flag == 1" rules both precede this
+# one and, for every record, either consume it with "next" or parse the
+# pending definition and clear their flag.
+/^[ \t]*FLAG/ {
+	collecting_flag = 1;
+	flag_line = $0;
+	sub(/^[ \t]+/, "", flag_line);
+	next;
+}
+
+# Runs for every record while an EVENT definition is being accumulated.
+collecting_event == 1 {
+	# An indented record is a continuation: join it onto the definition.
+	if ($0 ~ /^[ \t]/) {
+		sub(/^[ \t]+/, "");
+		event_line = event_line " " $0;
+		next;
+	}
+
+	# Any other record ends the definition: parse
+	# "EVENT <name> <id> <level> <keywords> <struct> <format...>"
+	# into the events[] table.
+	collecting_event = 0;
+	line = event_line;
+	event_line = "";
+
+	gsub(/[ \t]+/, " ", line);
+	split(line, fields, " ");
+
+	if (length(fields) < 7)
+		die("Invalid EVENT line: expected at least 7 fields (name id level keywords struct format)");
+
+	nevents++;
+	events[nevents, "name"] = fields[2];
+	events[nevents, "id"] = fields[3];
+	events[nevents, "level"] = fields[4];
+	events[nevents, "keywords"] = fields[5];
+	events[nevents, "struct"] = fields[6];
+
+	# The format string is every remaining word, re-joined with single
+	# spaces, minus one leading and one trailing double quote.
+	format = fields[7];
+	for (i = 8; i <= length(fields); i++)
+		format = format " " fields[i];
+	sub(/^"/, "", format);
+	sub(/"$/, "", format);
+	events[nevents, "format"] = format;
+
+	# No "next": the rules below still need to examine the current record.
+}
+
+# EVENT <name> <id> <level> <keywords> <struct> <format...>
+# Starts accumulating an EVENT definition; indented records that follow are
+# joined onto it by the "collecting_event == 1" rule.
+#
+# No pending EVENT can exist at this point: the "collecting_event == 1"
+# rule precedes this one and, for every record, either consumes it with
+# "next" or parses the pending definition and clears the flag.
+/^[ \t]*EVENT/ {
+	collecting_event = 1;
+	event_line = $0;
+	sub(/^[ \t]+/, "", event_line);
+	next;
+}
+
+# Comment lines are ignored.  Recognized openers, after optional indent:
+#   //   C++ style comment
+#   #    shell style comment
+#   /*   start of a C style comment block (only this line is skipped)
+#   *    continuation line of a C style comment block
+/^[ \t]*(\/\/|#|\/\*|\*)/ {
+	next;
+}
+
+# Empty and whitespace-only lines are ignored.
+/^[ \t]*$/ {
+	next;
+}
+
+# Anything that reaches this point is unrecognized and is skipped silently
+# (it could be part of a multi-line comment).
+{
+	next;
+}
+
+# Print a non-fatal diagnostic on stderr, prefixed with the script name.
+#
+#   msg  message text, printed verbatim
+#
+# msg is passed as an argument to a fixed "%s" format rather than being
+# used as the format itself: callers embed schema text and literal
+# sequences such as "%u" in it, which must not be treated as conversions.
+function warn(msg)
+{
+	printf "eventlog_gen.awk: %s\n", msg > "/dev/stderr";
+}
+
+# Parse one accumulated STRUCT definition into the structs[] tables.
+#
+#   struct_input  "STRUCT <name> [<field> ...]" with continuation lines
+#                 already joined; whitespace is normalized here
+#
+# Accepted field forms:
+#   name:type                  fixed field
+#   name:type:annotation       annotation is hex, ntohs, enum_<name> or
+#                              flag_<name>
+#   name:char[N]               fixed-size char array, stored as type "char[N]"
+#   name:type[countfield:max]  variable-length trailing array (VARLEN)
+#
+# A struct may contain at most one VARLEN field; it must be the last field,
+# it may not carry an annotation, and its count field must be an earlier
+# unsigned scalar field.  Every violation is reported through die().
+#
+# Called from the "collecting_struct == 1" rule and from END for a trailing
+# STRUCT.  Every parameter after struct_input is a local variable.
+function finalize_struct(struct_input, line, fields, parts, i, j, annotation, ftype, count_field, count_field_idx, max_str, max_val, bracket_pos, colon_pos, head, tail, bparts, fname, field_count, nfields)
+{
+	line = struct_input;
+	gsub(/[ \t]+/, " ", line);
+	nfields = split(line, fields, " ");
+
+	if (nfields < 2) {
+		die("Invalid STRUCT line: expected STRUCT <name> [<field1>:<type1> ...]");
+	}
+
+	nstructs++;
+	structs[nstructs, "name"] = fields[2];
+	structs[nstructs, "has_varlen"] = 0;
+
+	field_count = 0;
+	for (i = 3; i <= nfields; i++) {
+		if (fields[i] == "")
+			continue;
+
+		# Square brackets mean either a fixed char array (name:char[N]) or
+		# a VARLEN field (name:type[countfield:max]).  The two are told
+		# apart by the ':' that only VARLEN has inside the brackets.
+		bracket_pos = index(fields[i], "[");
+		if (bracket_pos > 0) {
+			if (substr(fields[i], length(fields[i]), 1) != "]") {
+				die("Invalid bracketed field: " fields[i] " (missing ']')");
+			}
+			# head is "name:type", tail is the text between '[' and ']'.
+			head = substr(fields[i], 1, bracket_pos - 1);
+			tail = substr(fields[i], bracket_pos + 1, length(fields[i]) - bracket_pos - 1);
+			if (index(tail, ":") != 0) {
+				# VARLEN field.  Only one is supported per struct: the
+				# bookkeeping below keeps a single varlen_field_idx.
+				if (structs[nstructs, "has_varlen"]) {
+					die("STRUCT " structs[nstructs, "name"] " may declare only one varlen field (second is " fields[i] ")");
+				}
+				colon_pos = index(head, ":");
+				if (colon_pos == 0) {
+					die("Invalid varlen field: " fields[i] " (expected name:type[countfield:max])");
+				}
+				fname = substr(head, 1, colon_pos - 1);
+				ftype = substr(head, colon_pos + 1);
+				# A second ':' before the bracket would be an annotation,
+				# which would otherwise end up inside the type spelling.
+				if (index(ftype, ":") != 0) {
+					die("Varlen field " fname " may not carry an annotation");
+				}
+				split(tail, bparts, ":");
+				if (length(bparts) != 2) {
+					die("Invalid varlen field: " fields[i] " (expected [countfield:max])");
+				}
+				count_field = bparts[1];
+				max_str = bparts[2];
+				if (max_str !~ /^[0-9]+$/ || (max_str + 0) == 0) {
+					die("Invalid varlen max: " max_str " (must be positive integer)");
+				}
+				if (ftype == "char" || match(ftype, /^char\[[0-9]+\]$/)) {
+					die("Varlen field " fname " may not use char/char[] element type");
+				}
+				field_count++;
+				structs[nstructs, "field", field_count, "name"] = fname;
+				structs[nstructs, "field", field_count, "type"] = ftype;
+				structs[nstructs, "field", field_count, "is_varlen"] = 1;
+				structs[nstructs, "field", field_count, "varlen_count"] = count_field;
+				structs[nstructs, "field", field_count, "varlen_max"] = max_str + 0;
+				structs[nstructs, "has_varlen"] = 1;
+				structs[nstructs, "varlen_field_idx"] = field_count;
+				# Placement (must be last) is verified after the loop.
+				continue;
+			}
+			# No ':' inside the brackets: fixed char[N].  Fall through to
+			# the name:type parsing, which keeps "char[N]" as the type.
+		}
+
+		# Fixed field: name:type[:annotation]
+		split(fields[i], parts, ":");
+		if (length(parts) < 2 || length(parts) > 3) {
+			die("Invalid field definition: " fields[i] " (expected field:type[:enum_or_flag_or_hex])");
+		}
+		field_count++;
+		structs[nstructs, "field", field_count, "name"] = parts[1];
+		structs[nstructs, "field", field_count, "type"] = parts[2];
+		if (length(parts) == 3) {
+			annotation = parts[3];
+			if (annotation == "hex") {
+				structs[nstructs, "field", field_count, "hex_format"] = 1;
+			} else if (annotation == "ntohs") {
+				structs[nstructs, "field", field_count, "ntohs"] = 1;
+			} else if (substr(annotation, 1, 5) == "enum_") {
+				structs[nstructs, "field", field_count, "enum_type"] = substr(annotation, 6);
+			} else if (substr(annotation, 1, 5) == "flag_") {
+				structs[nstructs, "field", field_count, "flag_type"] = substr(annotation, 6);
+			} else {
+				die("Invalid annotation: " annotation " (expected hex, ntohs, enum_<name>, or flag_<name>)");
+			}
+		}
+	}
+	structs[nstructs, "field_count"] = field_count;
+
+	# Validate varlen placement and its count-field reference.
+	if (structs[nstructs, "has_varlen"]) {
+		if (structs[nstructs, "varlen_field_idx"] != field_count) {
+			die("Varlen field in STRUCT " structs[nstructs, "name"] " must be the last field");
+		}
+		count_field = structs[nstructs, "field", field_count, "varlen_count"];
+		count_field_idx = 0;
+		# Search only the fields that precede the varlen field.
+		for (j = 1; j < field_count; j++) {
+			if (structs[nstructs, "field", j, "name"] == count_field) {
+				count_field_idx = j;
+				break;
+			}
+		}
+		if (count_field_idx == 0) {
+			die("Varlen count field '" count_field "' not found in STRUCT " structs[nstructs, "name"]);
+		}
+		# The count field must be one of the unsigned scalar types below.
+		ftype = structs[nstructs, "field", count_field_idx, "type"];
+		if (ftype != "uint8_t" && ftype != "uint16_t" && ftype != "uint32_t" &&
+		    ftype != "uint64_t" && ftype != "u_char" && ftype != "u_short" &&
+		    ftype != "u_int" && ftype != "u_long" && ftype != "size_t") {
+			die("Varlen count field '" count_field "' must be an unsigned scalar (got " ftype ")");
+		}
+		structs[nstructs, "varlen_count_idx"] = count_field_idx;
+	}
+}
+
+# Return the size in bytes of a schema field type, using the sizes of a
+# 64-bit platform (long, size_t and pointers are 8 bytes).
+#
+#   type  type spelling as written in the schema, e.g. "uint32_t",
+#         "struct in_addr" or "char[64]"
+#
+# An unrecognized type produces a warning and is assumed to be 4 bytes.
+function get_type_size(type)
+{
+	if (type ~ /^(uint8_t|int8_t|char|u_char)$/)
+		return 1;
+	if (type ~ /^(uint16_t|int16_t|short|u_short)$/)
+		return 2;
+	if (type ~ /^(uint32_t|int32_t|int|u_int|lwpid_t)$/)
+		return 4;
+	if (type ~ /^(uint64_t|int64_t|long|u_long|size_t)$/)
+		return 8;
+	# Pointer-sized integers and "void*" / "void *".
+	if (type ~ /^(uintptr_t|intptr_t)$/ || type ~ /^void ?\*$/)
+		return 8;
+	# IPv4 address.
+	if (type ~ /^(in_addr_t|struct in_addr)$/)
+		return 4;
+	# IPv6 address.
+	if (type ~ /^(in6_addr_t|struct in6_addr)$/)
+		return 16;
+	# char[N]: the size is the number between the brackets, which starts
+	# at the sixth character and excludes the closing ']'.
+	if (match(type, /^char\[[0-9]+\]$/))
+		return substr(type, 6, length(type) - 6) + 0;
+
+	warn("Unknown type size for: " type ", assuming 4 bytes");
+	return 4;
+}
+
+# Return the printf conversion used to display a field.
+#
+#   field_type  type spelling as written in the schema
+#   enum_type   non-empty when the field carries an enum annotation
+#   flag_type   non-empty when the field carries a flag annotation
+#   hex_format  true when the field carries the hex annotation
+#
+# Enum/flag fields, IPv4/IPv6 addresses and char[N] arrays are shown as
+# strings ("%s"); pointers use "%p".  The hex annotation only affects the
+# unsigned integer types.  An unrecognized type produces a warning and
+# falls back to "%u".
+function get_printf_format(field_type, enum_type, flag_type, hex_format)
+{
+	# Values that are converted to text before printing.
+	if (enum_type != "" || flag_type != "")
+		return "%s";
+	if (field_type ~ /^(in_addr_t|struct in_addr|in6_addr_t|struct in6_addr)$/)
+		return "%s";
+	if (field_type ~ /^void ?\*$/)
+		return "%p";
+	if (match(field_type, /^char\[[0-9]+\]$/))
+		return "%s";
+
+	# Unsigned integers: 64-bit wide ones need the 'l' length modifier.
+	if (field_type ~ /^(uint64_t|u_long|size_t)$/)
+		return hex_format ? "%lx" : "%lu";
+	if (field_type ~ /^(uint8_t|u_char|uint16_t|u_short|uint32_t|u_int|lwpid_t)$/)
+		return hex_format ? "%x" : "%u";
+
+	# Signed integers are always printed in decimal.
+	if (field_type ~ /^(int64_t|long)$/)
+		return "%ld";
+	if (field_type ~ /^(int8_t|char|int16_t|short|int32_t|int)$/)
+		return "%d";
+
+	warn("Unknown printf format for type: " field_type ", using %u");
+	return "%u";
+}
+
+END {
+ # Process any remaining collected STRUCT
+ if (collecting_struct == 1 && struct_line != "") {
+ finalize_struct(struct_line);
+ struct_line = "";
+ collecting_struct = 0;
+ }
+
+ # Process any remaining collected EVENT
+ if (collecting_event == 1 && event_line != "") {
+ line = event_line;
+ gsub(/[ \t]+/, " ", line);
+ split(line, fields, " ");
+
+ if (length(fields) >= 7) {
+ nevents++;
+ events[nevents, "name"] = fields[2];
+ events[nevents, "id"] = fields[3];
+ events[nevents, "level"] = fields[4];
+ events[nevents, "keywords"] = fields[5];
+ events[nevents, "struct"] = fields[6];
+ format = "";
+ for (i = 7; i <= length(fields); i++) {
+ if (i > 7)
+ format = format " ";
+ format = format fields[i];
+ }
+ gsub(/^"/, "", format);
+ gsub(/"$/, "", format);
+ events[nevents, "format"] = format;
+ }
+ }
+
+ # Process any remaining collected ENUM
+ if (collecting_enum == 1 && enum_line != "") {
+ line = enum_line;
+ gsub(/[ \t]+/, " ", line);
+ split(line, fields, " ");
+
+ if (length(fields) >= 2) {
+ nenums++;
+ enums[nenums, "name"] = fields[2];
+ value_count = 0;
+ for (i = 3; i <= length(fields); i++) {
+ if (fields[i] == "")
+ continue;
+ split(fields[i], parts, ":");
+ if (length(parts) == 2) {
+ value_count++;
+ enums[nenums, "value", value_count, "num"] = parts[1];
+ enums[nenums, "value", value_count, "name"] = parts[2];
+ }
+ }
+ enums[nenums, "value_count"] = value_count;
+ }
+ }
+
+ # Process any remaining collected FLAG
+ if (collecting_flag == 1 && flag_line != "") {
+ line = flag_line;
+ gsub(/[ \t]+/, " ", line);
+ split(line, fields, " ");
+
+ if (length(fields) >= 2) {
+ nflags++;
+ flags[nflags, "name"] = fields[2];
+ value_count = 0;
+ for (i = 3; i <= length(fields); i++) {
+ if (fields[i] == "")
+ continue;
+ split(fields[i], parts, ":");
+ if (length(parts) == 2) {
+ value_count++;
+ flags[nflags, "value", value_count, "num"] = parts[1];
+ flags[nflags, "value", value_count, "name"] = parts[2];
+ }
+ }
+ flags[nflags, "value_count"] = value_count;
+ }
+ }
+
+ if (provider == "") {
+ die("PROVIDER must be defined at the beginning of the schema file");
+ }
+
+ if (nevents == 0) {
+ die("No events found in schema file");
+ }
+
+ # Add KEYWORD SESSION for reserved events (0x80000000 = EVENTLOG_KEYWORD_SESSION)
+ for (i = 1; i <= nkeywords; i++) {
+ if (keywords[i, "name"] == "SESSION")
+ break;
+ }
+ if (i > nkeywords) {
+ nkeywords++;
+ keywords[nkeywords, "name"] = "SESSION";
+ keywords[nkeywords, "value"] = "0x80000000";
+ }
+ # Add struct for SESSION_CREATE payload if schema does not define it
+ has_session_create = 0;
+ for (si = 1; si <= nstructs; si++) {
+ if (structs[si, "name"] == "SESSION_CREATE") {
+ has_session_create = 1;
+ break;
+ }
+ }
+ if (!has_session_create) {
+ nstructs++;
+ structs[nstructs, "name"] = "SESSION_CREATE";
+ structs[nstructs, "field", 1, "name"] = "_unused";
+ structs[nstructs, "field", 1, "type"] = "uint8_t";
+ structs[nstructs, "field_count"] = 1;
+ }
+ # SESSION_END has no payload - uses NONE struct
+ # Add reserved events with fixed IDs (UINT32_MAX-1, UINT32_MAX) if not in schema
+ has_session_create_evt = 0;
+ has_session_end_evt = 0;
+ for (ei = 1; ei <= nevents; ei++) {
+ if (events[ei, "name"] == "SESSION_CREATE") has_session_create_evt = 1;
+ if (events[ei, "name"] == "SESSION_END") has_session_end_evt = 1;
+ }
+ if (!has_session_create_evt) {
+ nevents++;
+ events[nevents, "name"] = "SESSION_CREATE";
+ events[nevents, "id"] = 4294967294; # UINT32_MAX-1
+ events[nevents, "level"] = "INFO";
+ events[nevents, "keywords"] = "SESSION";
+ events[nevents, "struct"] = "SESSION_CREATE";
+ events[nevents, "format"] = "Session created";
+ }
+ if (!has_session_end_evt) {
+ nevents++;
+ events[nevents, "name"] = "SESSION_END";
+ events[nevents, "id"] = 4294967295; # UINT32_MAX
+ events[nevents, "level"] = "INFO";
+ events[nevents, "keywords"] = "SESSION";
+ events[nevents, "struct"] = "NONE";
+ events[nevents, "format"] = "Session ended";
+ }
+
+ if (hfile == "") {
+ provider_lower = tolower(provider);
+ # If outdir was provided via -v outdir=..., use it
+ if (outdir != "") {
+ if (mode == "consumer") {
+ hfile = outdir "/" provider_lower "_eventlog_consumer.h";
+ } else {
+ hfile = outdir "/" provider_lower "_eventlog.h";
+ }
+ } else {
+ if (mode == "consumer") {
+ hfile = provider_lower "_eventlog_consumer.h";
+ } else {
+ hfile = provider_lower "_eventlog.h";
+ }
+ }
+ }
+
+ # Generate header file header
+ generated = "@" "generated";
+ printh("/*");
+ printh(" * THIS FILE AUTOMATICALLY GENERATED. DO NOT EDIT.");
+ printh(" *");
+ printh(" * Generated from " srcfile);
+ printh(" * by eventlog_gen.awk");
+ printh(" */");
+ printh("");
+ printh("#ifndef _" toupper(provider) "_EVENTLOG_H_");
+ printh("#define _" toupper(provider) "_EVENTLOG_H_");
+ printh("");
+ printh("#include <sys/eventlog.h>");
+ # Check if any struct uses in_addr, in6_addr, char[N], or declares a
+ # trailing variable-length array.
+ needs_inet = 0;
+ needs_inet6 = 0;
+ needs_string = 0;
+ needs_iovec = 0;
+ for (i = 1; i <= nstructs; i++) {
+ if (structs[i, "has_varlen"] == 1)
+ needs_iovec = 1;
+ field_count = structs[i, "field_count"];
+ for (j = 1; j <= field_count; j++) {
+ field_type = structs[i, "field", j, "type"];
+ if (field_type == "in_addr_t" || field_type == "struct in_addr")
+ needs_inet = 1;
+ if (field_type == "in6_addr_t" || field_type == "struct in6_addr")
+ needs_inet6 = 1;
+ if (match(field_type, /^char\[[0-9]+\]$/))
+ needs_string = 1;
+ }
+ }
+ if (needs_inet || needs_inet6)
+ printh("#include <netinet/in.h>");
+ # libkern (strncpy, bzero) only for kernel producer; consumer uses string.h
+ if (needs_string && mode == "producer")
+ printh("#include <sys/libkern.h>");
+ # struct iovec for the gather write path (producer-only).
+ if (needs_iovec && mode == "producer")
+ printh("#include <sys/uio.h>");
+ printh("");
+
+ # Generate provider instance and macros at the top
+ provider_upper = toupper(provider);
+ provider_lower = tolower(provider);
+
+ # Calculate maximum event size by finding the largest struct.
+ # For varlen structs, the max includes the tail: sizeof(fixed head) + max_elements * sizeof(element).
+ max_size = 0;
+ for (i = 1; i <= nevents; i++) {
+ struct_name = events[i, "struct"];
+ if (struct_name == "NONE")
+ continue;
+ struct_idx = 0;
+ for (j = 1; j <= nstructs; j++) {
+ if (structs[j, "name"] == struct_name) {
+ struct_idx = j;
+ break;
+ }
+ }
+ if (struct_idx == 0) {
+ die("Struct " struct_name " not found for event " events[i, "name"]);
+ }
+ field_count = structs[struct_idx, "field_count"];
+ event_size = 0;
+ for (j = 1; j <= field_count; j++) {
+ field_type = structs[struct_idx, "field", j, "type"];
+ if (structs[struct_idx, "field", j, "is_varlen"] == 1) {
+ event_size += get_type_size(field_type) * structs[struct_idx, "field", j, "varlen_max"];
+ } else {
+ event_size += get_type_size(field_type);
+ }
+ }
+ if (event_size > max_size) {
+ max_size = event_size;
+ }
+ }
+
+
+ # Generate struct type definitions.
+ # For varlen structs the trailing array is NOT declared as a C member --
+ # callers access it through the generated accessor helper. We emit a
+ # comment documenting the wire layout.
+ printh("/* Event data structures */");
+ for (i = 1; i <= nstructs; i++) {
+ struct_name = structs[i, "name"];
+ field_count = structs[i, "field_count"];
+ # For structs with a VARLEN trailing array, force struct alignment
+ # (and therefore sizeof) to be a multiple of alignof(elem_type) so
+ # the trailing array starts on an aligned offset. Without this, a
+ # head that ends at e.g. offset 12 followed by a uint64[] trailer
+ # would produce an unaligned cast (-Wcast-align) and, worse, a
+ # real unaligned access on strict-alignment architectures.
+ struct_align = 0;
+ if (structs[i, "has_varlen"] == 1) {
+ vidx = structs[i, "varlen_field_idx"];
+ struct_align = get_type_size(structs[i, "field", vidx, "type"]);
+ }
+ if (struct_align > 0) {
+ printh("struct __aligned(" struct_align ") " \
+ provider_lower "_eventlog_" tolower(struct_name) " {");
+ } else {
+ printh("struct " provider_lower "_eventlog_" tolower(struct_name) " {");
+ }
+ for (j = 1; j <= field_count; j++) {
+ field_name = structs[i, "field", j, "name"];
+ field_type = structs[i, "field", j, "type"];
+ if (structs[i, "field", j, "is_varlen"] == 1) {
+ # Documentation only - the trailing array lives in the wire payload,
+ # not in this C struct. Use the accessor helper to read it.
+ printh("\t/* Followed on the wire by " field_type " " field_name \
+ "[" structs[i, "field", j, "varlen_count"] "]; " \
+ "max " structs[i, "field", j, "varlen_max"] " elements */");
+ continue;
+ }
+ # Map special types to their C equivalents
+ if (field_type == "in_addr_t")
+ field_type = "struct in_addr";
+ else if (field_type == "in6_addr_t")
+ field_type = "struct in6_addr";
+ # char[N] -> char field_name[N];
+ if (match(field_type, /^char\[([0-9]+)\]$/)) {
+ array_size = substr(field_type, RSTART + 5, RLENGTH - 6);
+ printh("\tchar\t" field_name "[" array_size "];");
+ } else {
+ printh("\t" field_type "\t" field_name ";");
+ }
+ }
+ printh("};");
+ # Emit a MAX constant + accessor helper for any varlen field.
+ if (structs[i, "has_varlen"] == 1) {
+ vidx = structs[i, "varlen_field_idx"];
+ vname = structs[i, "field", vidx, "name"];
+ vtype = structs[i, "field", vidx, "type"];
+ vcount = structs[i, "field", vidx, "varlen_count"];
+ vmax_define = toupper(provider) "_EVENTLOG_" toupper(struct_name) \
+ "_" toupper(vname) "_MAX";
+ printh("#define\t" vmax_define "\t" \
+ structs[i, "field", vidx, "varlen_max"]);
+ printh("");
+ printh("/*");
+ printh(" * Read the trailing " vname "[] array from a " struct_name " wire payload.");
+ printh(" * Returns a pointer to the first element, or NULL if the payload is too");
+ printh(" * small to hold the claimed count. Callers should use evt->" vcount);
+ printh(" * (already bounded to " vmax_define ") as the element count.");
+ printh(" */");
+ printh("static inline const " vtype " *");
+ printh(provider_lower "_eventlog_" tolower(struct_name) "_" tolower(vname) \
+ "(const struct " provider_lower "_eventlog_" tolower(struct_name) " *evt, size_t payload_size)");
+ printh("{");
+ printh("\tsize_t __head = sizeof(*evt);");
+ printh("\tsize_t __n = (size_t)evt->" vcount ";");
+ printh("\tif (__n > " vmax_define ")");
+ printh("\t\treturn NULL;");
+ printh("\tif (payload_size < __head + __n * sizeof(" vtype "))");
+ printh("\t\treturn NULL;");
+ # Cast via const void * to silence -Wcast-align. The struct
+ # definition above carries __aligned(sizeof(" vtype ")) so the
+ # trailing array is guaranteed to start on an aligned offset.
+ printh("\treturn (const " vtype " *)(const void *)((const char *)evt + __head);");
+ printh("}");
+ }
+ printh("");
+ }
+
+ # Generate keyword flag definitions
+ printh("/* Event keyword flags */");
+ # Keywords defined in schema
+ next_bit = 0x0001; # Start from first bit
+ for (i = 1; i <= nkeywords; i++) {
+ # Convert value to hex if it's numeric, otherwise use as-is
+ value = keywords[i, "value"];
+ if (value ~ /^0x[0-9a-fA-F]+$/) {
+ # Already hex - use as-is
+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t" value);
+ } else if (value ~ /^[0-9]+$/) {
+ # Decimal - if it's a small number (like 1), treat as relative bit position
+ # Otherwise use the value directly converted to hex
+ num_value = value + 0;
+ if (num_value < 16) {
+ # Small number - treat as relative bit position
+ # 1 = first bit (0x0001), 2 = second bit (0x0002), etc.
+ # Calculate: 2^(num_value-1)
+ bit_shift = num_value - 1;
+ bit_value = 2 ^ bit_shift;
+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t0x" sprintf("%04x", bit_value));
+ } else {
+ # Large number - use directly as hex
+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t0x" sprintf("%04x", num_value));
+ }
+ } else {
+ # Use as-is (might be a constant)
+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t" value);
+ }
+ }
+ printh("");
+
+ # Generate enum constant definitions (needed for both modes)
+ if (nenums > 0) {
+ printh("/* Enum constant definitions */");
+ for (i = 1; i <= nenums; i++) {
+ enum_name = enums[i, "name"];
+ value_count = enums[i, "value_count"];
+ for (j = 1; j <= value_count; j++) {
+ value_num = enums[i, "value", j, "num"];
+ value_name = enums[i, "value", j, "name"];
+ printh("#define\t" toupper(provider) "_EVENTLOG_" toupper(enum_name) "_" toupper(value_name) "\t" value_num);
+ }
+ }
+ printh("");
+ }
+
+ # Generate flag constant definitions (needed for both modes)
+ if (nflags > 0) {
+ printh("/* Flag constant definitions */");
+ for (i = 1; i <= nflags; i++) {
+ flag_name = flags[i, "name"];
+ value_count = flags[i, "value_count"];
+ for (j = 1; j <= value_count; j++) {
+ value_num = flags[i, "value", j, "num"];
+ value_name = flags[i, "value", j, "name"];
+ printh("#define\t" toupper(provider) "_EVENTLOG_FLAG_" toupper(value_name) "\t" value_num);
+ }
+ }
+ printh("");
+ }
+
+ # Generate event ID constants (both modes)
+ printh("/* Event ID constants */");
+ for (i = 1; i <= nevents; i++) {
+ printh("#define\t" toupper(provider) "_EVENTLOG_" events[i, "name"] "_ID\t" events[i, "id"]);
+ }
+ printh("");
+
+ # Generate event definitions (producer mode only)
+ if (mode == "producer") {
+ printh("/* Events */");
+ printh("");
+
+ for (i = 1; i <= nevents; i++) {
+ # Convert level to enum value
+ level_enum = "EVENTLOG_LEVEL_INFO";
+ if (events[i, "level"] == "ERROR")
+ level_enum = "EVENTLOG_LEVEL_ERROR";
+ else if (events[i, "level"] == "WARN")
+ level_enum = "EVENTLOG_LEVEL_WARN";
+ else if (events[i, "level"] == "VERBOSE")
+ level_enum = "EVENTLOG_LEVEL_VERBOSE";
+ else if (events[i, "level"] == "TRACE")
+ level_enum = "EVENTLOG_LEVEL_TRACE";
+
+ # Parse keywords into keyword_flags
+ keywords_str = events[i, "keywords"];
+ keyword_flags = "";
+ # Only use keywords explicitly defined in schema
+ for (j = 1; j <= nkeywords; j++) {
+ keyword_name = keywords[j, "name"];
+ if (index(keywords_str, keyword_name) > 0)
+ keyword_flags = keyword_flags " | " toupper(provider) "_EVENTLOG_KEYWORD_" keyword_name;
+ }
+
+ # Remove leading " | "
+ if (keyword_flags != "") {
+ # Drop the 3-character " | " prefix (substr is 1-based, so start at position 4)
+ keyword_flags = substr(keyword_flags, 4);
+ } else {
+ keyword_flags = "0";
+ }
+
+ # Generate defines for event
+ event_name = events[i, "name"];
+ struct_name = events[i, "struct"];
+ provider_upper = toupper(provider);
+ provider_lower = tolower(provider);
+
+ # Generate comment block for this event
+ printh("/*");
+ printh(" * " provider_upper " " event_name " Event");
+ printh(" */");
+ printh("");
+
+ # Generate enabled macro - uses session effective_level/effective_keywords
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_ENABLED(__session) \\");
+ printh("\t((__session != NULL) && \\");
+ printh("\t ((__session)->effective_level >= " level_enum ") && \\");
+ printh("\t (((__session)->effective_keywords & (" keyword_flags ")) != 0))");
+ printh("");
+
+ if (struct_name == "NONE") {
+ # No-payload event: macros take only __session
+ printh("/* struct eventlog_session *session */");
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(__session) \\");
+ printh("\tdo { \\");
+ printh("\t\teventlog_event_write(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), NULL, 0); \\");
+ printh("\t} while (0)");
+ printh("");
+ printh("/* struct eventlog_session *session */");
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG(__session) \\");
+ printh("\tdo { \\");
+ printh("\t\tif (" provider_upper "_EVENTLOG_" event_name "_ENABLED(__session)) { \\");
+ printh("\t\t\t" provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(__session); \\");
+ printh("\t\t} \\");
+ printh("\t} while (0)");
+ printh("");
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" events[i, "format"] "\"");
+ printh("");
+ } else {
+ # Find the struct definition for this event
+ struct_idx = 0;
+ for (j = 1; j <= nstructs; j++) {
+ if (structs[j, "name"] == struct_name) {
+ struct_idx = j;
+ break;
+ }
+ }
+
+ if (struct_idx == 0) {
+ die("Struct " struct_name " not found for event " event_name);
+ }
+
+ # Generate _LOG_ALWAYS and _LOG macros
+ field_count = structs[struct_idx, "field_count"];
+ has_varlen = (structs[struct_idx, "has_varlen"] == 1);
+ varlen_idx = has_varlen ? structs[struct_idx, "varlen_field_idx"] : 0;
+
+ # For varlen events, the last parameter is a pointer to a user-supplied
+ # source array of elements; the count is taken from the count_field
+ # parameter already in the signature. We do not append an extra count
+ # parameter -- the count is whichever scalar field the schema named.
+
+ # Build parameter list (without types) and comment list (with types)
+ # Use __ prefix for all parameters to avoid collisions with struct field names
+ param_list = "__session";
+ # Comment uses non-prefixed names for readability
+ param_comment = "struct eventlog_session *session";
+ struct_type_name = provider_lower "_eventlog_" tolower(struct_name);
+
+ for (j = 1; j <= field_count; j++) {
+ field_name = structs[struct_idx, "field", j, "name"];
+ field_type = structs[struct_idx, "field", j, "type"];
+
+ # Add to parameter list with __ prefix
+ param_list = param_list ", __" field_name;
+ # Comment uses non-prefixed names for readability
+ if (structs[struct_idx, "field", j, "is_varlen"] == 1) {
+ param_comment = param_comment ", const " field_type " *" field_name;
+ } else {
+ param_comment = param_comment ", " field_type " " field_name;
+ }
+ }
+
+ # Generate _LOG_ALWAYS macro (does the actual logging)
+ # Use __ prefixed parameter names to avoid collisions with struct field names
+ # Check if any field is char[N] (requires strncpy, not direct assignment)
+ has_char_array = 0;
+ for (j = 1; j <= field_count; j++) {
+ field_type = structs[struct_idx, "field", j, "type"];
+ if (match(field_type, /^char\[[0-9]+\]$/)) {
+ has_char_array = 1;
+ break;
+ }
+ }
+
+ printh("/* " param_comment " */");
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(" param_list ") \\");
+ printh("\tdo { \\");
+ if (has_varlen) {
+ # Build a 2-element iovec: [0] = the fixed head on the
+ # stack, [1] = the caller's source array. The framework's
+ # gather write path copies directly into the subscriber
+ # ring buffer, avoiding the pre-copy and the worst-case
+ # stack footprint of a composite struct.
+ varlen_field_name = structs[struct_idx, "field", varlen_idx, "name"];
+ varlen_elem_type = structs[struct_idx, "field", varlen_idx, "type"];
+ varlen_count_name = structs[struct_idx, "field", varlen_idx, "varlen_count"];
+ varlen_max_define = provider_upper "_EVENTLOG_" toupper(struct_name) \
+ "_" toupper(varlen_field_name) "_MAX";
+
+ # All declarations first so the expansion is valid in a
+ # nested block in strict C modes.
+ printh("\t\tstruct " struct_type_name " __head; \\");
+ printh("\t\tstruct iovec __iov[2]; \\");
+ printh("\t\tsize_t __n = (size_t)(__" varlen_count_name "); \\");
+ printh("\t\tif (__n > " varlen_max_define ") \\");
+ printh("\t\t\t__n = " varlen_max_define "; \\");
+ if (has_char_array) {
+ printh("\t\tbzero(&__head, sizeof(__head)); \\");
+ }
+ # Assign head fields (all fields except the varlen one).
+ for (j = 1; j < field_count; j++) {
+ field_name = structs[struct_idx, "field", j, "name"];
+ field_type = structs[struct_idx, "field", j, "type"];
+ if (match(field_type, /^char\[[0-9]+\]$/)) {
+ printh("\t\tstrncpy(__head." field_name ", (__" field_name ") ? (__" field_name ") : \"\", sizeof(__head." field_name ") - 1); \\");
+ printh("\t\t__head." field_name "[sizeof(__head." field_name ") - 1] = '\\0'; \\");
+ } else if (field_name == varlen_count_name) {
+ # Overwrite the count with the clamped value so
+ # the wire layout matches what we actually pack.
+ printh("\t\t__head." field_name " = __n; \\");
+ } else {
+ printh("\t\t__head." field_name " = (__" field_name "); \\");
+ }
+ }
+ printh("\t\t__iov[0].iov_base = (void *)&__head; \\");
+ printh("\t\t__iov[0].iov_len = sizeof(__head); \\");
+ printh("\t\t__iov[1].iov_base = __DECONST(void *, (__" varlen_field_name ")); \\");
+ printh("\t\t__iov[1].iov_len = __n * sizeof(" varlen_elem_type "); \\");
+ printh("\t\teventlog_event_write_gather(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), __iov, 2); \\");
+ } else if (has_char_array) {
+ printh("\t\tstruct " struct_type_name " __evt; \\");
+ printh("\t\tbzero(&__evt, sizeof(__evt)); \\");
+ for (j = 1; j <= field_count; j++) {
+ field_name = structs[struct_idx, "field", j, "name"];
+ field_type = structs[struct_idx, "field", j, "type"];
+ if (match(field_type, /^char\[[0-9]+\]$/)) {
+ printh("\t\tstrncpy(__evt." field_name ", (__" field_name ") ? (__" field_name ") : \"\", sizeof(__evt." field_name ") - 1); \\");
+ printh("\t\t__evt." field_name "[sizeof(__evt." field_name ") - 1] = '\\0'; \\");
+ } else {
+ printh("\t\t__evt." field_name " = (__" field_name "); \\");
+ }
+ }
+ printh("\t\teventlog_event_write(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), &__evt, sizeof(__evt)); \\");
+ } else {
+ printh("\t\tstruct " struct_type_name " __evt = { \\");
+ for (j = 1; j <= field_count; j++) {
+ field_name = structs[struct_idx, "field", j, "name"];
+ if (j < field_count) {
+ printh("\t\t\t." field_name " = (__" field_name "), \\");
+ } else {
+ printh("\t\t\t." field_name " = (__" field_name ") \\");
+ }
+ }
+ printh("\t\t}; \\");
+ printh("\t\teventlog_event_write(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), &__evt, sizeof(__evt)); \\");
+ }
+ printh("\t} while (0)");
+ printh("");
+
+ # Generate _LOG macro (checks enabled and calls _LOG_ALWAYS)
+ # Use same __ prefixed parameters, pass directly to _LOG_ALWAYS
+ printh("/* " param_comment " */");
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG(" param_list ") \\");
+ printh("\tdo { \\");
+ printh("\t\tif (" provider_upper "_EVENTLOG_" event_name "_ENABLED(__session)) { \\");
+ printh("\t\t\t" provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(" param_list "); \\");
+ printh("\t\t} \\");
+ printh("\t} while (0)");
+ printh("");
+ # Generate format string constant - for producer mode, just store the original format string
+ # The consumer mode will convert %N placeholders to printf format specifiers
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" events[i, "format"] "\"");
+ printh("");
+ }
+ }
+
+ # SESSION_END/SESSION_CREATE use fixed IDs from eventlog.h - no producer defines needed
+ }
+
+ # Generate enum/flag lookup functions for userland (consumer mode only)
+ if (mode == "consumer") {
+ # SESSION_END/SESSION_CREATE use fixed EVENTLOG_SESSION_END_ID, EVENTLOG_SESSION_CREATE_ID from eventlog.h
+ printh("#include <stdio.h>");
+ printh("#include <string.h>");
+ # Check if we need arpa/inet.h and sys/socket.h for IP address formatting or ntohs
+ needs_inet_header = 0;
+ for (i = 1; i <= nstructs; i++) {
+ field_count = structs[i, "field_count"];
+ for (j = 1; j <= field_count; j++) {
+ field_type = structs[i, "field", j, "type"];
+ if (field_type == "in_addr_t" || field_type == "struct in_addr" || field_type == "in6_addr_t" || field_type == "struct in6_addr") {
+ needs_inet_header = 1;
+ break;
+ }
+ if (structs[i, "field", j, "ntohs"] == 1) {
+ needs_inet_header = 1;
+ break;
+ }
+ }
+ if (needs_inet_header)
+ break;
+ }
+ if (needs_inet_header) {
+ printh("#include <sys/socket.h>");
+ printh("#include <arpa/inet.h>");
+ printh("#include <netinet/in.h>");
+ }
+ printh("");
+ printh("/*");
+ printh(" * Format string constants");
+ printh(" */");
+ printh("");
+ # Format string constants are generated in the formatting function below
+ # where %N placeholders are converted to printf format specifiers
+ printh("");
+ printh("/*");
+ printh(" * Enum and flag lookup functions");
+ printh(" * These functions convert numeric enum/flag values to strings");
+ printh(" */");
+ printh("");
+
+ # Generate enum lookup functions
+ for (i = 1; i <= nenums; i++) {
+ enum_name = enums[i, "name"];
+ value_count = enums[i, "value_count"];
+ printh("/*");
+ printh(" * Lookup enum value for " enum_name);
+ printh(" * Returns string representation or NULL if not found");
+ printh(" */");
+ printh("static inline const char *");
+ printh(provider_lower "_eventlog_enum_" tolower(enum_name) "_to_string(uint32_t value)");
+ printh("{");
+ printh("\tswitch (value) {");
+ for (j = 1; j <= value_count; j++) {
+ value_num = enums[i, "value", j, "num"];
+ value_name = enums[i, "value", j, "name"];
+ printh("\tcase " value_num ":");
+ printh("\t\treturn \"" value_name "\";");
+ }
+ printh("\tdefault:");
+ printh("\t\treturn NULL;");
+ printh("\t}");
+ printh("}");
+ printh("");
+ }
+
+ # Generate flag lookup functions
+ for (i = 1; i <= nflags; i++) {
+ flag_name = flags[i, "name"];
+ value_count = flags[i, "value_count"];
+ printh("/*");
+ printh(" * Lookup flag value for " flag_name);
+ printh(" * Returns string representation of combined flags or NULL if empty");
+ printh(" * Format: \"FLAG1|FLAG2|...\"");
+ printh(" */");
+ printh("static inline int");
+ printh(provider_lower "_eventlog_flag_" tolower(flag_name) "_to_string(uint32_t value, char *buf, size_t bufsize)");
+ printh("{");
+ printh("\tint len = 0;");
+ printh("\tint first = 1;");
+ printh("");
+ printh("\tif (buf == NULL || bufsize == 0)");
+ printh("\t\treturn -1;");
+ printh("");
+ printh("\tbuf[0] = '\\0';");
+ printh("");
+ printh("\tif (value == 0)");
+ printh("\t\treturn 0;");
+ printh("");
+ # Generate flag bit checks
+ for (j = 1; j <= value_count; j++) {
+ value_num = flags[i, "value", j, "num"];
+ value_name = flags[i, "value", j, "name"];
+ printh("\tif (value & " value_num ") {");
+ printh("\t\tif (!first && len < (int)bufsize - 1) {");
+ printh("\t\t\tbuf[len++] = '|';");
+ printh("\t\t}");
+ printh("\t\tfirst = 0;");
+ printh("\t\tif (len < (int)bufsize - 1) {");
+ printh("\t\t\tint n = snprintf(buf + len, bufsize - len, \"" value_name "\");");
+ printh("\t\t\tif (n > 0 && n < (int)(bufsize - len))");
+ printh("\t\t\t\tlen += n;");
+ printh("\t\t}");
+ printh("\t}");
+ }
+ printh("");
+ printh("\tbuf[len] = '\\0';");
+ printh("\treturn len;");
+ printh("}");
+ printh("");
+ }
+
+ # Generate keyword name-to-bitmask lookup function
+ printh("/*");
+ printh(" * Convert a keyword name string to its bitmask value.");
+ printh(" * Returns the keyword bitmask, or 0 if the name is not recognized.");
+ printh(" */");
+ printh("static inline uint32_t");
+ printh(provider_lower "_eventlog_keyword_from_string(const char *name)");
+ printh("{");
+ for (i = 1; i <= nkeywords; i++) {
+ kw_name = keywords[i, "name"];
+ # Resolve the define value we already emitted
+ define_name = toupper(provider) "_EVENTLOG_KEYWORD_" kw_name;
+ printh("\tif (strcasecmp(name, \"" kw_name "\") == 0)");
+ printh("\t\treturn (" define_name ");");
+ }
+ printh("\treturn (0);");
+ printh("}");
+ printh("");
+
+ # Generate formatting functions for userland (elog utility)
+ printh("/*");
+ printh(" * Userland formatting functions for event log parsing");
+ printh(" * These functions format event data into human-readable strings");
+ printh(" */");
+ printh("");
+
+
+ # Generate per-event formatting functions
+ for (i = 1; i <= nevents; i++) {
+ event_name = events[i, "name"];
+ struct_name = events[i, "struct"];
+ format_str = events[i, "format"];
+ event_id = events[i, "id"];
+
+ if (struct_name == "NONE") {
+ # No-payload event: format function takes no evt argument
+ printf_format_escaped = format_str;
+ gsub(/\\/, "\\\\", printf_format_escaped);
+ gsub(/"/, "\\\"", printf_format_escaped);
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" printf_format_escaped "\"");
+ printh("");
+ printh("/*");
+ printh(" * Format " provider_upper " " event_name " event to string");
+ printh(" * Returns number of characters written (excluding null terminator),");
+ printh(" * or -1 on error");
+ printh(" */");
+ printh("static inline int");
+ printh(provider_lower "_eventlog_format_" tolower(event_name) "(char *buf, size_t bufsize)");
+ printh("{");
+ printh("\treturn snprintf(buf, bufsize, " provider_upper "_EVENTLOG_" event_name "_FORMAT);");
+ printh("}");
+ printh("");
+ continue;
+ }
+
+ # Find the struct definition
+ struct_idx = 0;
+ for (j = 1; j <= nstructs; j++) {
+ if (structs[j, "name"] == struct_name) {
+ struct_idx = j;
+ break;
+ }
+ }
+
+ if (struct_idx == 0) {
+ die("Struct " struct_name " not found for event " event_name);
+ }
+
+ struct_type_name = provider_lower "_eventlog_" tolower(struct_name);
+ field_count = structs[struct_idx, "field_count"];
+
+ # Parse format string to find positional placeholders (%1, %2, etc.)
+ # and build mapping from placeholder index to field index
+ placeholder_count = 0;
+ delete placeholder_to_field;
+ # Extract all %N placeholders from format string
+ format_copy = format_str;
+ while (match(format_copy, /%[0-9]+/)) {
+ placeholder_num = substr(format_copy, RSTART + 1, RLENGTH - 1) + 0; # Extract number, convert to int
+ if (placeholder_num > 0 && placeholder_num <= field_count) {
+ # Varlen fields cannot be referenced from a format string --
+ # there is no single printf specifier for a variable-length
+ # array. Use the generated accessor helper at runtime instead.
+ if (structs[struct_idx, "field", placeholder_num, "is_varlen"] == 1) {
+ die("Event " event_name " format references varlen field (%s); use the generated accessor helper instead", \
+ structs[struct_idx, "field", placeholder_num, "name"]);
+ }
+ placeholder_count++;
+ placeholder_to_field[placeholder_count] = placeholder_num;
+ } else {
+ die("Invalid placeholder %" placeholder_num " in format string for event " event_name " (field count is " field_count ")");
+ }
+ format_copy = substr(format_copy, RSTART + RLENGTH);
+ }
+
+ # If no placeholders found, assume old-style format (all fields in order)
+ # but only when the format string contains % (e.g. "Value: %u"). Events with
+ # no format args (e.g. "Timer canceled", "Session ended") use no-args path.
+ # Skip varlen fields -- they cannot be printf-formatted inline.
+ if (placeholder_count == 0 && index(format_str, "%") > 0 && event_name != "SESSION_END") {
+ for (j = 1; j <= field_count; j++) {
+ if (structs[struct_idx, "field", j, "is_varlen"] == 1)
+ continue;
+ placeholder_count++;
+ placeholder_to_field[placeholder_count] = j;
+ }
+ }
+
+ # Build printf format string by replacing %N placeholders with actual format specifiers
+ # (Do this early so we can generate the format constant before the function)
+ printf_format = format_str;
+ # Process placeholders in reverse order to avoid replacing parts of already-replaced placeholders
+ # Build a sorted list of unique field indices
+ delete field_indices;
+ field_idx_count = 0;
+ for (j = 1; j <= placeholder_count; j++) {
+ field_idx = placeholder_to_field[j];
+ found = 0;
+ for (k = 1; k <= field_idx_count; k++) {
+ if (field_indices[k] == field_idx) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ field_idx_count++;
+ field_indices[field_idx_count] = field_idx;
+ }
+ }
+ # Sort field indices in descending order for replacement
+ for (j = 1; j < field_idx_count; j++) {
+ for (k = j + 1; k <= field_idx_count; k++) {
+ if (field_indices[j] < field_indices[k]) {
+ tmp = field_indices[j];
+ field_indices[j] = field_indices[k];
+ field_indices[k] = tmp;
+ }
+ }
+ }
+ # Replace placeholders with format specifiers (largest first to avoid partial matches)
+ for (j = 1; j <= field_idx_count; j++) {
+ field_idx = field_indices[j];
+ field_type = structs[struct_idx, "field", field_idx, "type"];
+ enum_type = structs[struct_idx, "field", field_idx, "enum_type"];
+ flag_type = structs[struct_idx, "field", field_idx, "flag_type"];
+ hex_format = structs[struct_idx, "field", field_idx, "hex_format"];
+ format_spec = get_printf_format(field_type, enum_type, flag_type, hex_format);
+ # Add "0x" prefix for hex fields
+ if (hex_format) {
+ format_spec = "0x" format_spec;
+ }
+ # Replace %N with the format specifier
+ placeholder_str = "%" field_idx;
+ gsub(placeholder_str, format_spec, printf_format);
+ }
+
+ # Generate the format string constant with printf specifiers (before the function)
+ # Escape quotes and backslashes in the format string for C string literal
+ # Note: % signs are preserved as-is (they're part of printf format specifiers)
+ printf_format_escaped = printf_format;
+ gsub(/\\/, "\\\\", printf_format_escaped); # Escape backslashes
+ gsub(/"/, "\\\"", printf_format_escaped); # Escape quotes
+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" printf_format_escaped "\"");
+ printh("");
+
+ # Generate formatting function for this event
+ printh("/*");
+ printh(" * Format " provider_upper " " event_name " event to string");
+ printh(" * Returns number of characters written (excluding null terminator),");
+ printh(" * or -1 on error");
+ printh(" */");
+ printh("static inline int");
+ printh(provider_lower "_eventlog_format_" tolower(event_name) "(const struct " struct_type_name " *evt, size_t payload_size, char *buf, size_t bufsize)");
+ printh("{");
+ printh("\tint ret;");
+ printh("\t(void)payload_size; /* may be unused for fixed-size events */");
+ printh("");
+
+ # Determine which fields are actually used in the format string
+ delete fields_used;
+ for (j = 1; j <= placeholder_count; j++) {
+ field_idx = placeholder_to_field[j];
+ fields_used[field_idx] = 1;
+ }
+
+ # Generate enum/flag/IP lookups only for fields that are used
+ needs_lookup = 0;
+ for (j = 1; j <= field_count; j++) {
+ if (!fields_used[j])
+ continue;
+ field_type = structs[struct_idx, "field", j, "type"];
+ if (structs[struct_idx, "field", j, "enum_type"] != "" || structs[struct_idx, "field", j, "flag_type"] != "") {
+ needs_lookup = 1;
+ }
+ if (field_type == "in_addr_t" || field_type == "struct in_addr" || field_type == "in6_addr_t" || field_type == "struct in6_addr") {
+ needs_lookup = 1;
+ }
+ }
+
+ # Generate enum/flag/IP lookups and convert to strings (only for used fields)
+ for (j = 1; j <= field_count; j++) {
+ if (!fields_used[j])
+ continue;
+ field_name = structs[struct_idx, "field", j, "name"];
+ field_type = structs[struct_idx, "field", j, "type"];
+ enum_type = structs[struct_idx, "field", j, "enum_type"];
+ flag_type = structs[struct_idx, "field", j, "flag_type"];
+
+ if (enum_type != "") {
+ printh("\tconst char *" field_name "_str = " provider_lower "_eventlog_enum_" tolower(enum_type) "_to_string(evt->" field_name ");");
+ printh("\tchar " field_name "_val[32];");
+ printh("\tif (" field_name "_str == NULL)");
+ printh("\t\tsnprintf(" field_name "_val, sizeof(" field_name "_val), \"%u\", evt->" field_name ");");
+ } else if (flag_type != "") {
+ printh("\tchar " field_name "_buf[128];");
+ printh("\tint " field_name "_len = " provider_lower "_eventlog_flag_" tolower(flag_type) "_to_string(evt->" field_name ", " field_name "_buf, sizeof(" field_name "_buf));");
+ printh("\tchar " field_name "_val[32];");
+ printh("\tif (" field_name "_len == 0)");
+ printh("\t\tsnprintf(" field_name "_val, sizeof(" field_name "_val), \"%u\", evt->" field_name ");");
+ } else if (field_type == "in_addr_t" || field_type == "struct in_addr") {
+ printh("\tchar " field_name "_str[INET_ADDRSTRLEN];");
+ printh("\tif (inet_ntop(AF_INET, &evt->" field_name ", " field_name "_str, sizeof(" field_name "_str)) == NULL)");
+ printh("\t\tstrcpy(" field_name "_str, \"<invalid>\");");
+ } else if (field_type == "in6_addr_t" || field_type == "struct in6_addr") {
+ printh("\tchar " field_name "_str[INET6_ADDRSTRLEN];");
+ printh("\tif (inet_ntop(AF_INET6, &evt->" field_name ", " field_name "_str, sizeof(" field_name "_str)) == NULL)");
+ printh("\t\tstrcpy(" field_name "_str, \"<invalid>\");");
+ }
+ }
+
+ if (needs_lookup) {
+ printh("");
+ }
+
+ # Note: Format string constant was already generated above before the function
+ # printf_format variable is already set with the converted format string
+
+ # Build argument list in the order placeholders appear in format string
+ arg_list = "";
+ for (j = 1; j <= placeholder_count; j++) {
+ field_idx = placeholder_to_field[j];
+ field_name = structs[struct_idx, "field", field_idx, "name"];
+ field_type = structs[struct_idx, "field", field_idx, "type"];
+ enum_type = structs[struct_idx, "field", field_idx, "enum_type"];
+ flag_type = structs[struct_idx, "field", field_idx, "flag_type"];
+
+ if (arg_list != "")
+ arg_list = arg_list ", ";
+
+ if (enum_type != "") {
+ # Use enum string if available, otherwise use formatted number
+ arg_list = arg_list "(" field_name "_str != NULL ? " field_name "_str : " field_name "_val)";
+ } else if (flag_type != "") {
+ # Use flag string if available, otherwise use formatted number
+ arg_list = arg_list "(" field_name "_len > 0 ? " field_name "_buf : " field_name "_val)";
+ } else if (field_type == "in_addr_t" || field_type == "struct in_addr") {
+ # Use formatted IP address string
+ arg_list = arg_list field_name "_str";
+ } else if (field_type == "in6_addr_t" || field_type == "struct in6_addr") {
+ # Use formatted IPv6 address string (works with or without INET6)
+ arg_list = arg_list field_name "_str";
+ } else if (structs[struct_idx, "field", field_idx, "ntohs"] == 1) {
+ # Network-to-host byte order conversion
+ arg_list = arg_list "ntohs(evt->" field_name ")";
+ } else {
+ # Direct field access
+ arg_list = arg_list "evt->" field_name;
+ }
+ }
+
+ # Handle format strings with no placeholders (snprintf takes no extra arguments)
+ # Note: Format string constant was already generated above before the function
+ if (placeholder_count > 0) {
+ printh("\tret = snprintf(buf, bufsize, " provider_upper "_EVENTLOG_" event_name "_FORMAT, " arg_list ");");
+ } else {
+ printh("\t(void)evt; /* Unused for empty format */");
+ printh("\tret = snprintf(buf, bufsize, " provider_upper "_EVENTLOG_" event_name "_FORMAT);");
+ }
+ printh("\treturn ret;");
+ printh("}");
+ printh("");
+ }
+
+ # Generate generic formatting function that formats payload based on event ID
+ printh("/*");
+ printh(" * Format an event payload to string");
+ printh(" * payload: Pointer to event payload data");
+ printh(" * payload_size: Size of the payload");
+ printh(" * event_id: Event ID to determine which formatter to use");
+ printh(" * buf: Output buffer");
+ printh(" * bufsize: Size of output buffer");
+ printh(" * Returns number of characters written, or -1 on error");
+ printh(" */");
+ printh("static inline int");
+ printh(provider_lower "_eventlog_format_payload(const void *payload, size_t payload_size, uint32_t event_id, char *buf, size_t bufsize)");
+ printh("{");
+ printh("\t(void)payload_size; /* May be unused depending on event */");
+ printh("\tif (buf == NULL || bufsize == 0)");
+ printh("\t\treturn -1;");
+ printh("\tif (payload == NULL && payload_size > 0)");
+ printh("\t\treturn -1;");
+ printh("\t");
+ printh("\tswitch (event_id) {");
+
+ for (i = 1; i <= nevents; i++) {
+ event_name = events[i, "name"];
+ struct_name = events[i, "struct"];
+ event_id = events[i, "id"];
+
+ printh("\tcase " event_id ":");
+ if (struct_name == "NONE") {
+ printh("\t\treturn " provider_lower "_eventlog_format_" tolower(event_name) "(buf, bufsize);");
+ } else {
+ struct_type_name = provider_lower "_eventlog_" tolower(struct_name);
+ if (event_name == "SESSION_CREATE") {
+ printh("\t\tif (payload_size == 0)");
+ printh("\t\t\treturn snprintf(buf, bufsize, \"Session created\");");
+ }
+ printh("\t\treturn " provider_lower "_eventlog_format_" tolower(event_name) "((const struct " struct_type_name " *)payload, payload_size, buf, bufsize);");
+ }
+ }
+
+ printh("\tdefault:");
+ printh("\t\treturn snprintf(buf, bufsize, \"[UNKNOWN_EVENT_ID:%u]\", event_id);");
+ printh("\t}");
+ printh("}");
+ printh("");
+
+ # Generate event ID to name lookup function
+ printh("/*");
+ printh(" * Map event ID to event name string");
+ printh(" * Returns event name (e.g. \"IN\", \"OUT\") or NULL if unknown");
+ printh(" */");
+ printh("static inline const char *");
+ printh(provider_lower "_eventlog_event_id_to_name(uint32_t event_id)");
+ printh("{");
+ printh("\tswitch (event_id) {");
+
+ for (i = 1; i <= nevents; i++) {
+ event_name = events[i, "name"];
+ event_id = events[i, "id"];
+ printh("\tcase " event_id ": return \"" event_name "\";");
+ }
+
+ printh("\tdefault: return NULL;");
+ printh("\t}");
+ printh("}");
+ printh("");
+ }
+
+ printh("#endif /* _" provider_upper "_EVENTLOG_H_ */");
+}
+
diff --git a/include/eventlog/test_eventlog_schema.src b/include/eventlog/test_eventlog_schema.src
new file mode 100644
--- /dev/null
+++ b/include/eventlog/test_eventlog_schema.src
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * This file defines the schema for test event logging.
+ * It is processed by eventlog_gen.awk to generate the event log header
+ * for testing purposes.
+ */
+
+PROVIDER test
+
+KEYWORD BASIC 1
+KEYWORD ADVANCED 2
+KEYWORD COMPLEX 4
+
+ENUM test_status
+ 0:INIT
+ 1:RUNNING
+ 2:SUCCESS
+ 3:FAILED
+
+FLAG test_flags
+ 0x01:FLAG_A
+ 0x02:FLAG_B
+ 0x04:FLAG_C
+ 0x08:FLAG_D
+
+STRUCT SIMPLE_EVENT
+ value:uint32_t
+STRUCT STATUS_EVENT
+ id:uint64_t
+ status:uint8_t:enum_test_status
+STRUCT FLAGS_EVENT
+ id:uint64_t
+ flags:uint32_t:flag_test_flags
+STRUCT COMPLEX_EVENT
+ id:uint64_t
+ value:uint32_t
+ status:uint8_t:enum_test_status
+ flags:uint32_t:flag_test_flags
+ counter:int32_t
+STRUCT VARLEN_EVENT
+ id:uint64_t
+ count:uint8_t
+ values:uint64_t[count:8]
+
+EVENT SIMPLE_EVENT 0 INFO BASIC SIMPLE_EVENT
+ "Simple event: value=%u"
+EVENT STATUS_EVENT 1 INFO BASIC STATUS_EVENT
+ "[%lu] Status event: status=%s"
+EVENT FLAGS_EVENT 2 VERBOSE ADVANCED FLAGS_EVENT
+ "[%lu] Flags event: flags=%s"
+EVENT COMPLEX_EVENT 3 WARN COMPLEX COMPLEX_EVENT
+ "[%lu] Complex event: value=%u status=%s flags=%s counter=%d"
+EVENT VARLEN_EVENT 4 INFO BASIC VARLEN_EVENT
+ "[%1] Varlen event: count=%2"
diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile
--- a/share/man/man5/Makefile
+++ b/share/man/man5/Makefile
@@ -9,6 +9,7 @@
dir.5 \
disktab.5 \
elf.5 \
+ elog.5 \
ethers.5 \
eui64.5 \
fbtab.5 \
diff --git a/share/man/man5/elog.5 b/share/man/man5/elog.5
new file mode 100644
--- /dev/null
+++ b/share/man/man5/elog.5
@@ -0,0 +1,229 @@
+.\"
+.\" Copyright (c) 2026 Netflix, Inc.
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.Dd March 19, 2026
+.Dt ELOG 5
+.Os
+.Sh NAME
+.Nm elog
+.Nd eventlog binary capture file format
+.Sh DESCRIPTION
+An
+.Nm
+file stores a captured sequence of kernel events produced by the
+.Xr eventlog 9
+framework.
+Files are created by the
+.Xr elog 1
+utility using the
+.Fl o
+option and can be read back with the
+.Fl r
+option.
+.Pp
+All multi-byte integer fields are in host-native byte order
+.Pq little-endian on amd64 and aarch64 .
+.Pp
+An
+.Nm
+file consists of three contiguous sections:
+.Bd -literal -offset indent
+[file header (40 bytes)]
+[provider table (4 + N*34 bytes)]
+[event stream]
+.Ed
+.Ss File Header
+The file header is 40 bytes, packed with no padding:
+.Bl -column "Offset" "Size" "uint64_t" "dropped_events" -offset indent
+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
+.It 0 Ta 4 Ta Vt char[4] Ta Va magic
+.It 4 Ta 4 Ta Vt uint32_t Ta Va version
+.It 8 Ta 8 Ta Vt uint64_t Ta Va capture_start
+.It 16 Ta 8 Ta Vt uint64_t Ta Va start_utc_us
+.It 24 Ta 8 Ta Vt uint64_t Ta Va event_count
+.It 32 Ta 8 Ta Vt uint64_t Ta Va dropped_events
+.El
+.Pp
+The fields are:
+.Bl -tag -width indent
+.It Va magic
+The four ASCII bytes
+.Dq ELOG
+.Pq Li 0x45 0x4C 0x4F 0x47 .
+A consumer must verify this field before proceeding.
+.It Va version
+Format version number.
+The current version is 1.
+Unknown versions should be rejected.
+.It Va capture_start
+Timestamp of the first captured event, in microseconds since boot.
+.It Va start_utc_us
+UTC wall-clock time at capture start, in microseconds since the Unix epoch.
+Used together with
+.Va capture_start
+to convert event timestamps to UTC:
+.Bd -literal -offset indent
+utc_us = start_utc_us + (event.timestamp - capture_start)
+.Ed
+.It Va event_count
+Total number of events in the file.
+Written as 0 initially and updated by seeking back to the header when
+capture ends.
+May be 0 if the capturing tool was killed.
+.It Va dropped_events
+Number of events the kernel dropped due to full buffers during capture.
+Also updated when capture ends.
+.El
+.Pp
+The C structure is:
+.Bd -literal -offset indent
+struct elog_binary_header {
+ char magic[4];
+ uint32_t version;
+ uint64_t capture_start;
+ uint64_t start_utc_us;
+ uint64_t event_count;
+ uint64_t dropped_events;
+} __packed;
+.Ed
+.Ss Provider Table
+Immediately after the file header, the provider table maps numeric
+.Va provider_id
+values found in event headers to human-readable provider names.
+.Pp
+The table starts with a 4-byte count:
+.Bl -column "Offset" "Size" "uint32_t" -offset indent
+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
+.It 0 Ta 4 Ta Vt uint32_t Ta Va count
+.El
+.Pp
+This is followed by
+.Va count
+entries, each 34 bytes and packed:
+.Bl -column "Offset" "Size" "uint16_t" "provider_id" -offset indent
+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
+.It 0 Ta 2 Ta Vt uint16_t Ta Va provider_id
+.It 2 Ta 32 Ta Vt char[32] Ta Va name
+.El
+.Pp
+The
+.Va name
+field is null-terminated and zero-padded to 32 bytes.
+Multiple entries may share the same name when multiple kernel subsystems
+register providers under the same name
+.Pq see Xr eventlog 9 .
+The maximum number of entries is 32
+.Pq Dv EVENTLOG_MAX_PROVIDERS .
+.Pp
+The total provider table size is
+.Li 4 + count * 34
+bytes.
+.Pp
+The C structure for each entry is:
+.Bd -literal -offset indent
+struct eventlog_provider_info {
+ uint16_t provider_id;
+ char name[EVENTLOG_PROVIDER_NAME_MAX];
+} __packed;
+.Ed
+.Ss Event Stream
+The remainder of the file is a packed sequence of events in timestamp
+order.
+Each event consists of a 32-byte header followed by a variable-length
+payload:
+.Bd -literal -offset indent
+[header (32 bytes)][payload (0+ bytes)]
+.Ed
+.Pp
+Events are packed back-to-back with no inter-event padding.
+The event header layout is:
+.Bl -column "Offset" "Size" "uint64_t" "event_length" -offset indent
+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
+.It 0 Ta 2 Ta Vt uint16_t Ta Va event_length
+.It 2 Ta 2 Ta Vt uint16_t Ta Va cpu
+.It 4 Ta 2 Ta Vt uint16_t Ta Va provider_id
+.It 6 Ta 2 Ta Vt uint16_t Ta (reserved)
+.It 8 Ta 8 Ta Vt uint64_t Ta Va timestamp
+.It 16 Ta 8 Ta Vt uint64_t Ta Va session_id
+.It 24 Ta 4 Ta Vt uint32_t Ta Va event_id
+.It 28 Ta 4 Ta Vt int32_t Ta Va thread_id
+.El
+.Pp
+The
+.Va event_length
+field gives the total event size in bytes, including the header.
+The minimum value is 32
+.Pq header only, no payload ;
+the maximum is 65535.
+A parser advances to the next event by adding
+.Va event_length
+to the current event's offset.
+.Pp
+The
+.Va provider_id
+maps to a provider name via the provider table.
+The
+.Va timestamp
+is in microseconds since boot; convert to UTC using the formula
+described in the file header section.
+The
+.Va session_id
+is provider-defined.
+The reserved field at offset 6 is written as zero and must be ignored.
+.Pp
+Two
+.Va event_id
+values are reserved across all providers:
+.Bl -column "EVENTLOG_SESSION_CREATE_ID" "0xFFFFFFFE" -offset indent
+.It Sy Constant Ta Sy Value Ta Sy Meaning
+.It Dv EVENTLOG_SESSION_CREATE_ID Ta Li 0xFFFFFFFE Ta Session created
+.It Dv EVENTLOG_SESSION_END_ID Ta Li 0xFFFFFFFF Ta Session destroyed
+.El
+.Pp
+The
+.Dv EVENTLOG_SESSION_END_ID
+event has an empty payload
+.Pq 0 bytes .
+All other payload formats are provider-specific and defined by schema files.
+The payload size is
+.Va event_length
+minus 32.
+.Pp
+The event stream may end with an incomplete event at EOF if the capture
+was interrupted.
+Parsers should stop when fewer than 32 bytes remain or when
+.Va event_length
+is less than 32 or exceeds the remaining data.
+.Ss Compressed Files
+The
+.Xr elog 1
+utility also accepts
+.Pa .gz Ns -compressed
+files
+.Pq detected by file extension .
+The decompressed content has the same format described above.
+.Sh EXAMPLES
+Capture TCP events to a binary file:
+.Bd -literal -offset indent
+elog -c tcp -o /tmp/events.elog
+.Ed
+.Pp
+Read back as formatted text:
+.Bd -literal -offset indent
+elog -r /tmp/events.elog
+.Ed
+.Pp
+Read a compressed capture:
+.Bd -literal -offset indent
+elog -r /tmp/events.elog.gz
+.Ed
+.Sh SEE ALSO
+.Xr elog 1 ,
+.Xr eventlog 9
+.Sh HISTORY
+The
+.Nm
+file format first appeared in
+.Fx 16.0 .
diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -147,6 +147,7 @@
ecn.9 \
efirt.9 \
epoch.9 \
+ eventlog.9 \
ether_gen_addr.9 \
EVENTHANDLER.9 \
eventtimers.9 \
@@ -1123,6 +1124,30 @@
epoch.9 epoch_call.9 \
epoch.9 epoch_drain_callbacks.9 \
epoch.9 in_epoch.9
+MLINKS+=eventlog.9 eventlog_provider_create.9 \
+ eventlog.9 eventlog_provider_destroy.9 \
+ eventlog.9 eventlog_provider_get_level.9 \
+ eventlog.9 eventlog_provider_get_keywords.9 \
+ eventlog.9 eventlog_provider_get_default.9 \
+ eventlog.9 eventlog_provider_set_default.9 \
+ eventlog.9 eventlog_provider_get_sysctl_node.9 \
+ eventlog.9 eventlog_provider_get_sysctl_ctx.9 \
+ eventlog.9 eventlog_session_create.9 \
+ eventlog.9 eventlog_session_destroy.9 \
+ eventlog.9 eventlog_session_set_enabled.9 \
+ eventlog.9 eventlog_session_is_enabled.9 \
+ eventlog.9 eventlog_session_set_filter.9 \
+ eventlog.9 eventlog_event_write.9 \
+ eventlog.9 eventlog_event_write_at.9 \
+ eventlog.9 eventlog_event_write_gather.9 \
+ eventlog.9 eventlog_event_write_gather_at.9 \
+ eventlog.9 eventlog_subscriber_create_device.9 \
+ eventlog.9 eventlog_subscriber_create_callback.9 \
+ eventlog.9 eventlog_subscriber_destroy.9 \
+ eventlog.9 eventlog_subscriber_add_subscription.9 \
+ eventlog.9 eventlog_subscriber_drain_dumps.9 \
+ eventlog.9 eventlog_subscriber_read.9 \
+ eventlog.9 eventlog_subscriber_get_stats.9
MLINKS+=EVENTHANDLER.9 EVENTHANDLER_DECLARE.9 \
EVENTHANDLER.9 EVENTHANDLER_DEFINE.9 \
EVENTHANDLER.9 EVENTHANDLER_DEREGISTER.9 \
diff --git a/share/man/man9/eventlog.9 b/share/man/man9/eventlog.9
new file mode 100644
--- /dev/null
+++ b/share/man/man9/eventlog.9
@@ -0,0 +1,1097 @@
+.\"
+.\" Copyright (c) 2026 Netflix, Inc.
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.Dd May 11, 2026
+.Dt EVENTLOG 9
+.Os
+.Sh NAME
+.Nm eventlog
+.Nd subscription-based kernel event logging framework
+.Sh SYNOPSIS
+.In sys/eventlog.h
+.Ft "struct eventlog_provider *"
+.Fo eventlog_provider_create
+.Fa "const char *name"
+.Fa "eventlog_provider_dump_state_t dump_callback"
+.Fa "void *dump_callback_arg"
+.Fa "eventlog_default_changed_t default_changed"
+.Fa "void *default_changed_arg"
+.Fc
+.Ft void
+.Fo eventlog_provider_destroy
+.Fa "struct eventlog_provider *provider"
+.Fc
+.Ft "enum eventlog_level"
+.Fo eventlog_provider_get_level
+.Fa "struct eventlog_provider *provider"
+.Fc
+.Ft uint32_t
+.Fo eventlog_provider_get_keywords
+.Fa "struct eventlog_provider *provider"
+.Fc
+.Ft int
+.Fo eventlog_provider_get_default
+.Fa "struct eventlog_provider *provider"
+.Fc
+.Ft void
+.Fo eventlog_provider_set_default
+.Fa "struct eventlog_provider *provider"
+.Fa "int value"
+.Fc
+.Ft "struct sysctl_oid *"
+.Fo eventlog_provider_get_sysctl_node
+.Fa "struct eventlog_provider *provider"
+.Fc
+.Ft "struct sysctl_ctx_list *"
+.Fo eventlog_provider_get_sysctl_ctx
+.Fa "struct eventlog_provider *provider"
+.Fc
+.Ft "struct eventlog_session *"
+.Fo eventlog_session_create
+.Fa "struct eventlog_provider *provider"
+.Fa "uint64_t session_id"
+.Fa "bool waitok"
+.Fa "void *create_payload"
+.Fa "size_t create_payload_size"
+.Fc
+.Ft void
+.Fo eventlog_session_destroy
+.Fa "struct eventlog_session *session"
+.Fc
+.Ft void
+.Fo eventlog_session_set_enabled
+.Fa "struct eventlog_session *session"
+.Fa "int enabled"
+.Fc
+.Ft int
+.Fo eventlog_session_is_enabled
+.Fa "struct eventlog_session *session"
+.Fc
+.Ft void
+.Fo eventlog_session_set_filter
+.Fa "struct eventlog_session *session"
+.Fa "enum eventlog_level level"
+.Fa "uint32_t keywords"
+.Fc
+.Ft void
+.Fo eventlog_event_write
+.Fa "struct eventlog_session *session"
+.Fa "uint32_t id"
+.Fa "enum eventlog_level level"
+.Fa "uint32_t keywords"
+.Fa "void *buffer"
+.Fa "size_t length"
+.Fc
+.Ft void
+.Fo eventlog_event_write_at
+.Fa "struct eventlog_session *session"
+.Fa "uint32_t id"
+.Fa "enum eventlog_level level"
+.Fa "uint32_t keywords"
+.Fa "void *buffer"
+.Fa "size_t length"
+.Fa "uint64_t timestamp_us"
+.Fc
+.Ft void
+.Fo eventlog_event_write_gather
+.Fa "struct eventlog_session *session"
+.Fa "uint32_t id"
+.Fa "enum eventlog_level level"
+.Fa "uint32_t keywords"
+.Fa "const struct iovec *iov"
+.Fa "int iovcnt"
+.Fc
+.Ft void
+.Fo eventlog_event_write_gather_at
+.Fa "struct eventlog_session *session"
+.Fa "uint32_t id"
+.Fa "enum eventlog_level level"
+.Fa "uint32_t keywords"
+.Fa "const struct iovec *iov"
+.Fa "int iovcnt"
+.Fa "uint64_t timestamp_us"
+.Fc
+.In sys/eventlog_subscriber.h
+.Ft "struct eventlog_subscriber *"
+.Fo eventlog_subscriber_create_device
+.Fa "uint32_t buffer_size_per_cpu"
+.Fc
+.Ft "struct eventlog_subscriber *"
+.Fo eventlog_subscriber_create_callback
+.Fa "eventlog_callback_t callback"
+.Fa "void *callback_arg"
+.Fc
+.Ft void
+.Fo eventlog_subscriber_destroy
+.Fa "struct eventlog_subscriber *subscriber"
+.Fc
+.Ft int
+.Fo eventlog_subscriber_add_subscription
+.Fa "struct eventlog_subscriber *subscriber"
+.Fa "const char *provider_name"
+.Fa "enum eventlog_level level"
+.Fa "uint32_t keywords"
+.Fc
+.Ft void
+.Fo eventlog_subscriber_drain_dumps
+.Fa "struct eventlog_subscriber *subscriber"
+.Fc
+.Ft int
+.Fo eventlog_subscriber_read
+.Fa "struct eventlog_subscriber *subscriber"
+.Fa "struct uio *uio"
+.Fa "int flags"
+.Fc
+.Ft void
+.Fo eventlog_subscriber_get_stats
+.Fa "struct eventlog_subscriber *subscriber"
+.Fa "struct eventlog_stats *stats"
+.Fc
+.Sh DESCRIPTION
+The
+.Nm
+framework provides a subscription-based event logging system for the
+.Fx
+kernel.
+It enables kernel subsystems
+.Pq providers
+to emit structured events that can be captured by user-space tools or
+kernel callbacks
+.Pq subscribers
+with fine-grained filtering based on provider name, keyword bitmask, and
+log level.
+.Pp
+A single character device,
+.Pa /dev/eventlog ,
+handles all event subscriptions and delivery.
+Providers are only enabled when active subscribers exist, ensuring zero
+overhead when no one is listening.
+.Pp
+The framework is designed around the following principles:
+.Bl -bullet -compact
+.It
+Single system-wide device for all providers and subscribers.
+.It
+Subscription-based model with zero overhead when idle.
+.It
+Multi-provider support: multiple subsystems register under the same name
+and subscriptions automatically cover all matching providers.
+.It
+Per-subscriber filtering by log level and keyword bitmask.
+.It
+Lock-free per-CPU double-buffering for device subscribers.
+.It
+Timestamp-ordered event delivery across CPUs.
+.El
+.Ss Log Levels
+Events are classified by severity using the
+.Vt "enum eventlog_level"
+type.
+Higher numeric values are more verbose:
+.Bl -column "EVENTLOG_LEVEL_VERBOSE" "Value" -offset indent
+.It Sy Constant Ta Sy Value
+.It Dv EVENTLOG_LEVEL_NONE Ta 0
+.It Dv EVENTLOG_LEVEL_ERROR Ta 1
+.It Dv EVENTLOG_LEVEL_WARN Ta 2
+.It Dv EVENTLOG_LEVEL_INFO Ta 3
+.It Dv EVENTLOG_LEVEL_VERBOSE Ta 4
+.It Dv EVENTLOG_LEVEL_TRACE Ta 5
+.El
+.Pp
+Subscribing at a given level receives all events at that level and below
+.Pq less verbose .
+For example, subscribing at
+.Dv EVENTLOG_LEVEL_VERBOSE
+receives ERROR, WARN, INFO, and VERBOSE events but not TRACE.
+.Ss Providers
+A provider is a kernel subsystem that emits events.
+Providers are created with
+.Fn eventlog_provider_create ,
+which registers the provider under a given
+.Fa name
+and assigns it a unique
+.Va provider_id
+.Pq 1-based .
+The
+.Va provider_id
+is embedded in every event header.
+.Pp
+Multiple subsystems may register providers with the same name.
+For example, both the default TCP stack and the RACK TCP stack register
+providers named
+.Dq tcp :
+.Bd -literal -offset indent
+/* Default TCP stack */
+provider = eventlog_provider_create("tcp",
+ tcp_eventlog_dump_state, NULL,
+ tcp_eventlog_default_changed, NULL);
+
+/* RACK TCP stack */
+provider = eventlog_provider_create("tcp",
+ tcp_eventlog_dump_state, NULL,
+ tcp_eventlog_default_changed, NULL);
+.Ed
+.Pp
+When a subscriber subscribes by name, the subscription is applied to
+.Em all
+providers matching that name.
+Each provider instance gets its own unique
+.Va provider_id ,
+allowing consumers to distinguish which instance emitted a given event.
+.Pp
+The
+.Fn eventlog_provider_destroy
+function unregisters a provider.
+It synchronizes with the dump state lock to ensure no dump callback is
+in-flight when the provider is freed.
+.Pp
+The
+.Fn eventlog_provider_get_level
+and
+.Fn eventlog_provider_get_keywords
+functions return the current aggregate level and keyword mask computed
+from all active subscribers.
+These are primarily useful for testing and debugging.
+.Pp
+The
+.Fn eventlog_provider_get_default
+function returns the current default enablement setting for a provider
+.Pq 0 or 1 .
+The
+.Fn eventlog_provider_set_default
+function sets this value programmatically without iterating existing sessions.
+.Ss Shared Statistics
+All providers sharing the same name share a single
+.Vt eventlog_provider_stats
+structure, which owns the
+.Xr sysctl 9
+counters and the
+.Va default
+setting.
+This provides a single aggregated view under
+.Li kern.eventlog. Ns Ao Ar name Ac Ns Li .* :
+.Bl -column "kern.eventlog.<name>.sessions_created" -offset indent
+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .sessions_created
+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .sessions_active
+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .sessions_enabled
+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .default
+.El
+.Pp
+The
+.Li default
+sysctl controls whether new sessions start enabled or disabled.
+The initial value is 0
+.Pq disabled ,
+but can be overridden at boot via loader tunable
+.Pq e.g., Li kern.eventlog.tcp.default=1 No in Pa /boot/loader.conf
+or programmatically via
+.Fn eventlog_provider_set_default .
+It accepts four values:
+.Bl -tag -width indent
+.It 0
+New sessions start disabled.
+Sessions can be individually enabled later.
+.It 1
+New sessions start enabled.
+.It \-1
+Disable all currently active sessions across all providers with this
+name, then set the default to 0.
+For providers that register a
+.Fa default_changed
+callback, the framework skips the session iteration and defers it to
+the callback.
+.It 2
+Enable all currently disabled sessions across all providers with this
+name, then set the default to 1.
+For providers that register a
+.Fa default_changed
+callback, the framework skips the session iteration and defers it to
+the callback.
+.El
+.Pp
+The stats structure is reference-counted: created when the first provider
+with a given name is registered, shared when subsequent same-named
+providers are created, and freed when the last provider with that name
+is destroyed.
+.Ss Extending The Provider Sysctl Node
+Providers that need a handful of provider-specific knobs
+.Pq e.g., a sampling rate
+can hang additional
+.Xr sysctl 9
+children off the auto-generated
+.Li kern.eventlog. Ns Ao Ar name Ac
+node rather than creating a parallel
+.Li kern. Ns Ao Ar name Ac
+tree.
+Two accessors expose the node and its
+.Vt sysctl_ctx_list
+for this purpose:
+.Pp
+.Bl -tag -width Ds -compact
+.It Fn eventlog_provider_get_sysctl_node
+Returns the
+.Vt "struct sysctl_oid *"
+for
+.Li kern.eventlog. Ns Ao Ar name Ac .
+.It Fn eventlog_provider_get_sysctl_ctx
+Returns the
+.Vt "struct sysctl_ctx_list *"
+owned by the framework for this provider name.
+.El
+.Pp
+Children added via these handles are freed automatically when the last
+provider with the same name is destroyed, so callers should only register
+children whose backing storage is valid for the lifetime of the provider.
+For example, a provider with a tunable sampling rate could publish it as
+.Li kern.eventlog. Ns Ao Ar name Ac Ns Li .hz :
+.Bd -literal -offset indent
+provider = eventlog_provider_create("myprov", NULL, NULL, NULL, NULL);
+
+SYSCTL_ADD_INT(
+ eventlog_provider_get_sysctl_ctx(provider),
+ SYSCTL_CHILDREN(eventlog_provider_get_sysctl_node(provider)),
+ OID_AUTO, "hz", CTLFLAG_RW, &my_hz, 0,
+ "Samples per CPU per second");
+.Ed
+.Ss Dump State Callback
+When a subscriber subscribes for the first time to a provider that
+registered a
+.Fa dump_callback ,
+the framework schedules an asynchronous dump on its private
+.Dq eventlog_dump
+taskqueue.
+The callback emits current state for all of the provider's existing sessions
+so the new subscriber observes sessions that were created before it started
+listening.
+.Pp
+The callback runs on a dedicated framework taskqueue thread after
+.Fn eventlog_subscriber_add_subscription
+has already returned to the caller, so it must not depend on any per-thread
+state of the subscribing thread (e.g.,
+.Va curthread->td_vnet ,
+.Va curthread->td_proc ) ;
+providers that need a vnet context must establish one themselves with
+.Fn VNET_FOREACH /
+.Fn CURVNET_SET .
+The callback should use normal event write functions or schema-generated
+.Ql _LOG
+macros; the framework automatically routes those writes to only the
+subscriber that requested this dump.
+.Pp
+The dump taskqueue is single-threaded, so a provider's
+.Fa dump_callback
+is never invoked concurrently with itself or with any other dump_callback.
+Re-subscribing an already-subscribed provider does
+.Em not
+re-fire the dump (the subscriber already has the state); only a brand-new
+subscription enqueues a task.
+.Pp
+If
+.Fa dump_callback
+is
+.Dv NULL ,
+no task is enqueued.
+With multi-provider support a separate task is enqueued for each matching
+provider when subscribing by name.
+.Pp
+Once
+.Fa dump_callback
+returns, the framework synthesises a single
+.Dv EVENTLOG_DUMP_COMPLETE_ID
+event for the requesting subscriber.
+The event carries the dumping provider's id, a session id of zero, and an
+empty payload, and is filtered with the same
+.Dv EVENTLOG_LEVEL_INFO
++
+.Dv EVENTLOG_KEYWORD_SESSION
+contract that gates
+.Dv EVENTLOG_SESSION_CREATE_ID
+and
+.Dv EVENTLOG_SESSION_END_ID :
+subscribers that opted out of the session keyword bit do not receive it.
+Userspace consumers can key on
+.Dv EVENTLOG_DUMP_COMPLETE_ID
+to know that all replay events for the just-subscribed provider have
+been delivered.
+.Pp
+Callers that need to observe a self-consistent post-dump state before
+proceeding (test code, save-state tools) should call
+.Fn eventlog_subscriber_drain_dumps
+after subscribing; it blocks until every dump task this subscriber has
+outstanding has finished.
+.Fn eventlog_subscriber_destroy
+implicitly drains, so callers do not have to coordinate teardown with
+in-flight dumps.
+.Ss Default Changed Callback
+When the
+.Li kern.eventlog. Ns Ao Ar name Ac Ns Li .default
+sysctl is written, the framework invokes the optional
+.Fa default_changed
+callback on each provider that shares the name.
+The callback receives the raw sysctl value:
+.Bl -tag -width indent
+.It 0 No or 1
+Informational only.
+The framework has updated the stored default.
+.It \-1 No or 2
+The framework does
+.Em not
+iterate sessions for providers that have a
+.Fa default_changed
+callback; the provider is responsible for enabling or disabling its own
+sessions.
+For providers without a callback, the framework iterates sessions itself.
+.El
+.Pp
+The callback is invoked outside any eventlog lock; the provider may take
+its own locks.
+.Ss Provider Enablement
+Each provider instance is enabled or disabled independently based on its
+subscribers.
+Keywords are OR'd across all subscribers for that provider instance.
+The level is set to the most verbose level requested by any subscriber.
+For example, if subscriber A requests INFO with keywords 0x1 and
+subscriber B requests WARN with keywords 0x2, the provider is enabled at
+INFO with keywords 0x3.
+.Ss Sessions
+A session represents a single entity being observed, such as one TCP
+connection.
+Sessions are created with
+.Fn eventlog_session_create ,
+which allocates a session from a UMA zone and associates it with a
+provider.
+The
+.Fa session_id
+is a provider-assigned 64-bit identifier
+.Pq e.g., Va inp_gencnt No for TCP
+that is embedded in every event header.
+The meaning of
+.Fa session_id
+values is provider-defined.
+.Pp
+If
+.Fa waitok
+is true, allocations use
+.Dv M_WAITOK ;
+otherwise
+.Dv M_NOWAIT
+is used and the function may return
+.Dv NULL .
+.Pp
+The optional
+.Fa create_payload
+is provider-specific data included in the
+.Dv EVENTLOG_SESSION_CREATE_ID
+event emitted at session creation.
+If
+.Dv NULL ,
+a default payload containing only the creation timestamp is used.
+.Pp
+The session's initial enabled state is derived from the provider's
+.Li default
+sysctl.
+The
+.Dv EVENTLOG_SESSION_CREATE_ID
+event is only emitted when the session is enabled.
+.Pp
+The public session structure exposes two fields for use by the
+schema-generated
+.Ql _ENABLED
+macros:
+.Bd -literal -offset indent
+struct eventlog_session {
+ enum eventlog_level effective_level;
+ uint32_t effective_keywords;
+};
+.Ed
+.Pp
+The
+.Fn eventlog_session_set_enabled
+function enables or disables a session.
+When disabled,
+.Va effective_level
+is set to
+.Dv EVENTLOG_LEVEL_NONE
+so that
+.Ql _ENABLED
+checks fail with no function call overhead.
+.Pp
+The
+.Fn eventlog_session_set_filter
+function sets a per-session level and keyword override.
+When set, effective values use this override instead of the provider's
+aggregate values.
+Call
+.Fn eventlog_session_set_enabled
+after setting the filter to apply it.
+.Pp
+The
+.Fn eventlog_session_destroy
+function destroys a session and emits an
+.Dv EVENTLOG_SESSION_END_ID
+event if the session was enabled.
+.Ss Subscribers
+Subscribers are entities that receive events.
+There are two types:
+.Bl -tag -width indent
+.It Sy Device subscribers
+Created with
+.Fn eventlog_subscriber_create_device .
+These use per-CPU double-buffered storage with configurable buffer size
+.Pq Dv EVENTLOG_BUFFER_SIZE_MIN No to Dv EVENTLOG_BUFFER_SIZE_MAX .
+Events are read via
+.Fn eventlog_subscriber_read ,
+which merges events across CPUs by timestamp using a min-heap.
+Device subscribers can serve both user-space and kernel readers.
+.It Sy Callback subscribers
+Created with
+.Fn eventlog_subscriber_create_callback .
+Events are delivered directly to the callback function with no buffering.
+This provides the lowest latency but the callback must execute quickly
+as it runs in the context of the event writer.
+The callback function type is:
+.Bd -literal -offset indent
+typedef void (*eventlog_callback_t)(
+ const struct eventlog_event_header *hdr,
+ const char *provider_name,
+ uint8_t provider_name_len,
+ uint64_t session_id,
+ const struct iovec *iov,
+ int iovcnt,
+ size_t payload_size,
+ void *callback_arg);
+.Ed
+.El
+.Pp
+The
+.Fn eventlog_subscriber_add_subscription
+function subscribes to a named provider.
+If the provider name matches multiple provider instances, a separate
+internal subscription is created for each, but this is transparent to
+the caller.
+For every newly-subscribed provider that registered a
+.Fa dump_callback
+the framework schedules an asynchronous dump on the eventlog_dump
+taskqueue;
+see
+.Sx Dump State Callback .
+.Pp
+The
+.Fn eventlog_subscriber_drain_dumps
+function blocks the caller until every dump task this subscriber has
+outstanding has finished.
+.Pp
+The
+.Fn eventlog_subscriber_destroy
+function removes all subscriptions, frees resources, and updates
+provider enablement.
+It implicitly drains any pending dumps before freeing memory, so callers
+do not have to coordinate destroy with in-flight dumps.
+.Pp
+The
+.Fn eventlog_subscriber_get_stats
+function fills the
+.Fa stats
+structure with current subscriber statistics including
+.Va dropped_events .
+.Ss Event Writing
+Events are emitted using
+.Fn eventlog_event_write
+or
+.Fn eventlog_event_write_at .
+Both functions construct a 32-byte event header on the stack and deliver
+the event to all subscribers that match the provider, level, and keyword
+criteria.
+.Pp
+The
+.Fn eventlog_event_write_at
+variant accepts a pre-computed timestamp in microseconds since boot,
+for use when the caller has already queried the time.
+.Pp
+In practice, events are typically emitted via schema-generated macros
+rather than calling these functions directly.
+See
+.Sx SCHEMA-BASED CODE GENERATION
+below.
+.Ss Scatter/Gather Event Writing
+For events that carry a variable-length payload, the
+.Fn eventlog_event_write_gather
+and
+.Fn eventlog_event_write_gather_at
+variants accept the payload as an
+.Vt iovec
+of
+.Fa iovcnt
+segments.
+The framework prepends the 32-byte header and copies the concatenation
+of every segment into each subscriber's ring buffer as part of the same
+reserve step, so readers never observe a partially written event.
+A zero segment count or any zero-length entry is legal.
+.Pp
+These variants are the preferred shape when a producer has a fixed head
+followed by a variable-length tail: the caller builds a two-element iov
+referring to its own source buffers directly, avoiding an intermediate
+copy and the worst-case stack footprint of a composite struct.
+.Pp
+The schema generator emits calls to
+.Fn eventlog_event_write_gather
+for STRUCTs that declare a trailing varlen field.
+Scalar callers can keep using
+.Fn eventlog_event_write ;
+internally it builds a one-element iov and takes the same write path,
+so the two entry points deliver byte-identical events on the wire.
+.Pp
+Callback subscribers receive the payload in scatter/gather form: the
+callback signature is
+.Pp
+.Bd -literal -offset indent
+typedef void (*eventlog_callback_t)(
+ const struct eventlog_event_header *hdr,
+ const char *provider_name, uint8_t provider_name_len,
+ uint64_t session_id,
+ const struct iovec *iov, int iovcnt, size_t payload_size,
+ void *callback_arg);
+.Ed
+.Pp
+Delivery happens inside an
+.Xr smr 9
+critical section where
+.Xr malloc 9
+is not permitted, which is why the framework does not compact iov
+segments into a flat buffer.
+For scalar producers
+.Fa iovcnt
+is 1 and the payload is simply
+.Fa iov[0].iov_base .
+For producers that use
+.Fn eventlog_event_write_gather
+(the usual shape for schema-generated varlen events), the callback either
+walks the segments in order or
+.Xr memcpy 3 Ns s
+them into a caller-sized buffer.
+The
+.Fa iov
+and
+.Fa iov[*].iov_base
+pointers are only valid for the duration of the callback; callbacks must
+not retain them.
+.Pp
+Device subscribers are unaffected by this: iov segments are copied
+straight from the caller's buffers into the per-CPU ring, up to the
+wire-format
+.Dv UINT16_MAX
+.Va event_length
+cap.
+.Ss Event Write Path
+The event write path is designed for minimal overhead:
+.Bl -enum -compact
+.It
+The schema-generated
+.Ql _LOG
+macro first calls
+.Ql _ENABLED ,
+which checks the session's
+.Va effective_level
+and
+.Va effective_keywords
+fields directly with no function call.
+If disabled, the macro returns immediately.
+.It
+If enabled, the event structure is initialized on the stack and
+.Fn eventlog_event_write
+is called.
+.It
+The function enters an SMR critical section via
+.Fn smr_enter ,
+which disables thread preemption and pins the thread to the current CPU.
+This rules out thread-level writer-vs-writer contention; a hardware NMI on
+the same CPU can still nest inside the writer (see
+.Sx Per-CPU Buffering ) .
+.It
+All active subscribers are iterated under SMR protection
+.Pq no locks .
+For each matching subscriber, the event is routed by type:
+device subscribers receive a buffer write; callback subscribers receive
+a direct function invocation.
+.It
+The SMR critical section is exited via
+.Fn smr_exit .
+.El
+.Ss Event Format
+Each event is a contiguous byte sequence consisting of a 32-byte header
+followed by a variable-length payload.
+All multi-byte integer fields are in host-native byte order
+.Pq little-endian on amd64 and aarch64 .
+Events are packed back-to-back with no inter-event padding.
+.Pp
+The event header layout is:
+.Bl -column "Offset" "Size" "uint64_t" "event_length" -offset indent
+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
+.It 0 Ta 2 Ta Vt uint16_t Ta Va event_length
+.It 2 Ta 2 Ta Vt uint16_t Ta Va cpu
+.It 4 Ta 2 Ta Vt uint16_t Ta Va provider_id
+.It 6 Ta 2 Ta Vt uint16_t Ta (reserved)
+.It 8 Ta 8 Ta Vt uint64_t Ta Va timestamp
+.It 16 Ta 8 Ta Vt uint64_t Ta Va session_id
+.It 24 Ta 4 Ta Vt uint32_t Ta Va event_id
+.It 28 Ta 4 Ta Vt int32_t Ta Va thread_id
+.El
+.Pp
+The
+.Va event_length
+field gives the total event size in bytes
+.Pq header + payload .
+The minimum value is 32
+.Pq header only ;
+the maximum is 65535.
+The
+.Va timestamp
+is in microseconds since boot, obtained via
+.Fn binuptime
+and converted with
+.Fn bintime2us .
+The
+.Va session_id
+is provider-defined.
+The
+.Va thread_id
+is the kernel thread ID
+.Pq Vt lwpid_t ;
+0 if no thread context.
+.Pp
+Two
+.Va event_id
+values are reserved for session lifecycle events emitted by the framework:
+.Bl -column "EVENTLOG_SESSION_CREATE_ID" "0xFFFFFFFE" -offset indent
+.It Sy Constant Ta Sy Value Ta Sy Meaning
+.It Dv EVENTLOG_SESSION_CREATE_ID Ta Li 0xFFFFFFFE Ta Session created
+.It Dv EVENTLOG_SESSION_END_ID Ta Li 0xFFFFFFFF Ta Session destroyed
+.El
+.Pp
+These use the keyword
+.Dv EVENTLOG_KEYWORD_SESSION
+.Pq Li 0x80000000
+and level
+.Dv EVENTLOG_LEVEL_INFO .
+The payload format is described in the provider's schema;
+.Dv EVENTLOG_SESSION_END_ID
+has an empty payload.
+.Pp
+The payload immediately follows the header with no padding.
+Its size is
+.Va event_length
+minus 32 bytes and its format is provider-specific, defined by schema files.
+.Ss Per-CPU Buffering
+Device subscribers use per-CPU double-buffering.
+Each CPU has two buffers per subscriber: an active buffer where writers
+append events, and a reader buffer where the reader consumes events.
+.Pp
+All writer-side mutable state and the reader buffer length are packed
+into a single 64-bit word
+.Pq Va packed_state
+with the following bit layout:
+.Bl -column "Bits [63:32]" "swap_allowed" -offset indent
+.It Sy Bits Ta Sy Field Ta Sy Description
+.It Li [63:32] Ta Va reader_len Ta Bytes in reader buffer (set at swap, 30 bits)
+.It Li [31:2] Ta Va commit_pos Ta Byte offset of next write (30 bits)
+.It Li [1] Ta Va swap_allowed Ta Reader has drained; writers may swap
+.It Li [0] Ta Va active_buf Ta Which buffer is active (0 or 1)
+.El
+.Pp
+On targets where the MI
+.Fn atomic_*_64
+API is available
+.Pq Dv __LP64__ , i.e.\& every 64-bit architecture ,
+every state transition (writer commit, buffer swap, reader drain) is a
+single
+.Fn atomic_fcmpset_64 ;
+the path is lock-free and NMI-safe by construction.
+.Pp
+On 32-bit targets that do not provide
+.Fn atomic_*_64
+(FreeBSD's
+.Pa sys/atomic_common.h
+gates
+.Fn atomic_load_64
+on
+.Dv __LP64__
+for the same reason),
+.Va packed_state
+is the same 64-bit word but every state operation takes a per-pcpu_buf
+.Dv MTX_SPIN
+that serialises access to the otherwise non-atomic 64-bit field.
+NMI-safety is provided up-front: the writer entry point
+.Fn eventlog_subscriber_write_event_device
+calls
+.Fn mtx_owned "&pcpu_buf->swap_lock"
+and drops the event if true.
+In NMI context
+.Va curthread
+is the interrupted thread, so
+.Fn mtx_owned
+is true exactly when an NMI fired on a thread that already holds the
+swap lock; calling
+.Fn mtx_lock_spin
+in that case would deadlock the NMI handler against the interrupted
+thread, so dropping the event (counted in
+.Va dropped_events )
+is the only safe choice.
+.Pp
+Because
+.Fn smr_enter
+disables thread preemption and pins the thread to a CPU, no two threads
+can write the same per-CPU buffer concurrently.
+A hardware NMI on the same CPU can still nest inside an in-progress
+thread-level writer (NMIs are not blocked by critical sections), so the
+commit-CAS retry loop re-derives
+.Va active
+and
+.Va commit_pos
+from the post-CAS state on every failure and redoes the write at the new
+offset; an NMI's intervening commit (with or without a buffer swap) is
+therefore preserved.
+.Pp
+Buffer swap publishes the frozen
+.Va commit_pos
+as
+.Va reader_len
+and flips
+.Va active_buf .
+A reader observing
+.Va swap_allowed = 0
+is guaranteed to see the matching
+.Va reader_len > 0 .
+Writers perform a proactive swap when the active buffer is full and
+.Va swap_allowed
+is set; otherwise the event is dropped and the subscriber's
+.Va dropped_events
+counter is incremented.
+The reader swaps buffers when it needs data.
+Writers never spin waiting for the reader.
+.Ss Timestamp Epoch Boundary
+Each
+.Fn read
+call delivers events bounded by a per-read epoch.
+After the initial buffer swap, the reader captures a
+.Va read_timestamp .
+Events with timestamps beyond this epoch remain in the reader buffer
+for the next
+.Fn read
+call.
+.Pp
+During the merge loop, events are delivered in strict timestamp order
+using a min-heap of CPUs sorted by next-event timestamp.
+When an epoch boundary is hit, a resweep of idle CPUs catches events
+from writers that committed within the epoch but whose CPU was previously
+inactive.
+.Ss Schema-Based Code Generation
+Provider-specific event schemas are defined in
+.Pa include/eventlog/
+using
+.Pa .src
+files with the naming convention
+.Ao Ar provider Ac Ns Pa _eventlog_schema.src .
+The
+.Pa eventlog_gen.awk
+script processes these files in two modes:
+.Bl -tag -width indent
+.It Sy Producer mode Pq Fl h
+Generates event structure definitions,
+.Ql _ENABLED()
+check macros,
+.Ql _LOG()
+macros
+.Pq with enablement check ,
+and
+.Ql _LOG_ALWAYS()
+macros
+.Pq unconditional .
+.It Sy Consumer mode Pq Fl c
+Generates payload formatters, enum and flag lookup functions, and
+.Fn event_id_to_name
+dispatch functions for user-space tools.
+.El
+.Pp
+The generated code follows this pattern:
+.Bd -literal -offset indent
+#define TCP_EVENTLOG_IN_ENABLED(__session) \e
+ ((__session) != NULL && \e
+ (__session)->effective_level >= EVENTLOG_LEVEL_VERBOSE && \e
+ ((__session)->effective_keywords & TCP_EVENTLOG_KEYWORD_RX))
+
+#define TCP_EVENTLOG_IN_LOG(__session, ...) \e
+ do { \e
+ if (TCP_EVENTLOG_IN_ENABLED(__session)) \e
+ TCP_EVENTLOG_IN_LOG_ALWAYS(__session, ...); \e
+ } while (0)
+.Ed
+.Pp
+The enablement check reads two structure fields directly with no
+function call and no lock, making the fast path when disabled a single
+branch that is not taken.
+.Ss Device Interface
+The
+.Pa /dev/eventlog
+character device supports the following operations:
+.Bl -tag -width indent
+.It Fn open
+Opens the device.
+No subscriber is created at this point.
+The framework is host-global and is not exposed to jailed processes:
+.Fn open
+fails with
+.Er EPERM
+when the calling thread is in a jail
+.Pq Va cr_prison No is not Va prison0 .
+.It Fn close
+Destroys the subscriber if one was created, and updates provider
+enablement.
+.It Fn read
+Reads events merged by timestamp across per-CPU buffers.
+Each
+.Fn read
+returns zero or more complete events; no partial events are delivered.
+Blocks for up to one second if no data is available, unless
+.Dv FNONBLOCK
+is set.
+.It Fn ioctl
+Manages subscriptions and buffer configuration using the following
+commands:
+.Bl -tag -width indent
+.It Dv EVENTLOG_IOCTL_CREATE_SIZE Ns Pq Fa count
+Creates a subscriber with the specified per-CPU buffer size and
+subscribes to providers in one atomic operation.
+The argument is a variable-length
+.Vt "struct eventlog_create_req" .
+.It Dv EVENTLOG_IOCTL_DESTROY
+Unsubscribes from all providers and destroys the subscriber.
+.It Dv EVENTLOG_IOCTL_GET_STATS
+Returns the subscriber's
+.Va dropped_events
+count.
+.It Dv EVENTLOG_IOCTL_GET_PROVIDERS
+Returns the list of subscribed providers with their numeric IDs and
+names.
+Multiple entries may share the same name when multi-provider support
+is in use.
+.El
+.El
+.Ss Initialization
+The framework initializes in three boot-time phases:
+.Bl -enum -compact
+.It
+Mutexes are initialized at
+.Dv SI_SUB_LOCK .
+.It
+The session UMA zone is created at
+.Dv SI_SUB_KMEM .
+.It
+The
+.Pa /dev/eventlog
+character device is created at
+.Dv SI_SUB_DRIVERS .
+.El
+.Pp
+Providers register themselves during their own subsystem initialization.
+.Sh RETURN VALUES
+The
+.Fn eventlog_provider_create
+function returns a pointer to the new provider, or
+.Dv NULL
+on failure.
+.Pp
+The
+.Fn eventlog_session_create
+function returns a pointer to the new session, or
+.Dv NULL
+if allocation fails
+.Pq when Fa waitok No is false .
+.Pp
+The
+.Fn eventlog_provider_get_default
+function returns 0
+.Pq sessions start disabled
+or 1
+.Pq sessions start enabled .
+.Pp
+The
+.Fn eventlog_provider_get_sysctl_node
+and
+.Fn eventlog_provider_get_sysctl_ctx
+functions return the provider's auto-generated
+.Xr sysctl 9
+node and its backing context, which are owned by the framework.
+Both always succeed for a valid provider.
+.Pp
+The
+.Fn eventlog_session_is_enabled
+function returns non-zero if the session is enabled, or 0 if disabled or
+.Dv NULL .
+.Pp
+The
+.Fn eventlog_provider_get_level
+function returns the current aggregate log level.
+.Pp
+The
+.Fn eventlog_provider_get_keywords
+function returns the current aggregate keyword mask.
+.Pp
+The
+.Fn eventlog_subscriber_create_device
+and
+.Fn eventlog_subscriber_create_callback
+functions return a pointer to the new subscriber, or
+.Dv NULL
+on failure.
+.Pp
+The
+.Fn eventlog_subscriber_add_subscription
+function returns 0 on success or an error code on failure.
+.Pp
+The
+.Fn eventlog_subscriber_read
+function returns 0 on success or an error code on failure.
+It returns
+.Er EAGAIN
+when no data is available and
+.Dv FNONBLOCK
+is set.
+.Sh ERRORS
+.Fn open
+on
+.Pa /dev/eventlog
+may fail with:
+.Bl -tag -width Er
+.It Bq Er EPERM
+The calling thread is in a jail
+.Pq Va cr_prison No is not Va prison0 .
+The eventlog framework is host-global and is not exposed to jailed
+processes.
+.It Bq Er ENODEV
+The device was opened with
+.Dv FWRITE ,
+.Dv FEXEC ,
+.Dv FAPPEND ,
+or
+.Dv O_TRUNC .
+The device is read-only.
+.El
+.Sh SEE ALSO
+.Xr elog 1 ,
+.Xr tcp 4 ,
+.Xr elog 5 ,
+.Xr smr 9 ,
+.Xr sysctl 9 ,
+.Xr tcp_functions 9
+.Sh HISTORY
+The
+.Nm
+framework first appeared in
+.Fx 16.0 .
+.Sh AUTHORS
+The
+.Nm
+framework was developed by
+.An Netflix, Inc .
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3862,6 +3862,7 @@
kern/kern_environment.c standard
kern/kern_et.c standard
kern/kern_event.c standard
+kern/kern_eventlog.c standard
kern/kern_exec.c standard
kern/kern_exit.c standard
kern/kern_fail.c standard
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk
--- a/sys/conf/kern.pre.mk
+++ b/sys/conf/kern.pre.mk
@@ -74,6 +74,44 @@
INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include
+# Generate eventlog provider headers from schema files in
+# include/eventlog/. Each *_eventlog_schema.src is fed through
+# include/eventlog/eventlog_gen.awk to produce <provider>_eventlog.h
+# under ${.OBJDIR}/include/eventlog/, and that directory's parent is
+# added to INCLUDES so kernel sources can do
+# #include <eventlog/<provider>_eventlog.h>.
+_EVENTLOG_HEADER_DIR= ${.OBJDIR}/include/eventlog
+_EVENTLOG_SCHEMA_DIR= ${SRCTOP}/include/eventlog
+_EVENTLOG_SCHEMAS!= find ${_EVENTLOG_SCHEMA_DIR} -name '*_eventlog_schema.src' -type f 2>/dev/null | ${AWK} -F/ '{print $$NF}' || echo ""
+.if !empty(_EVENTLOG_SCHEMAS)
+.if !make(clean) && !make(cleandir) && !make(clobber)
+_EVENTLOG_GENHDRS!= mkdir -p ${_EVENTLOG_HEADER_DIR}; \
+ awk_script="${_EVENTLOG_SCHEMA_DIR}/eventlog_gen.awk"; \
+ for schema in ${_EVENTLOG_SCHEMAS}; do \
+ schema_path="${_EVENTLOG_SCHEMA_DIR}/$$schema"; \
+ provider=$$(${AWK} '/^PROVIDER/ {print tolower($$2); exit}' "$$schema_path" 2>/dev/null); \
+ [ -n "$$provider" ] || continue; \
+ header="${_EVENTLOG_HEADER_DIR}/$${provider}_eventlog.h"; \
+ if [ ! -f "$$header" ] || \
+ [ "$$schema_path" -nt "$$header" ] || \
+ [ "$$awk_script" -nt "$$header" ]; then \
+ cd ${SRCTOP} && ${AWK} -v outdir="${_EVENTLOG_HEADER_DIR}" -f include/eventlog/eventlog_gen.awk include/eventlog/$$schema -h; \
+ fi; \
+ done; echo done
+.endif
+.for schema in ${_EVENTLOG_SCHEMAS}
+_EVENTLOG_PROVIDER_${schema}!= ${AWK} '/^PROVIDER/ {print tolower($$2); exit}' ${_EVENTLOG_SCHEMA_DIR}/${schema} 2>/dev/null || echo ""
+.if !empty(_EVENTLOG_PROVIDER_${schema})
+_EVENTLOG_HEADER_${schema}= ${_EVENTLOG_HEADER_DIR}/${_EVENTLOG_PROVIDER_${schema}}_eventlog.h
+${_EVENTLOG_HEADER_${schema}}: ${_EVENTLOG_SCHEMA_DIR}/eventlog_gen.awk ${_EVENTLOG_SCHEMA_DIR}/${schema}
+ @mkdir -p ${_EVENTLOG_HEADER_DIR}
+ @cd ${SRCTOP} && ${AWK} -v outdir="${_EVENTLOG_HEADER_DIR}" -f include/eventlog/eventlog_gen.awk include/eventlog/${schema} -h
+BEFORE_DEPEND+= ${_EVENTLOG_HEADER_${schema}}
+.endif
+.endfor
+INCLUDES+= -I${_EVENTLOG_HEADER_DIR:H}
+.endif
+
CFLAGS= ${COPTFLAGS} ${DEBUG}
CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h
CFLAGS_PARAM_INLINE_UNIT_GROWTH?=100
diff --git a/sys/conf/kmod.mk b/sys/conf/kmod.mk
--- a/sys/conf/kmod.mk
+++ b/sys/conf/kmod.mk
@@ -491,6 +491,37 @@
${SYSDIR}/dev/bhnd/nvram/nvram_map -h
.endif
+# Generate an eventlog provider header from a single schema file. A kmod
+# opts in by setting EVENTLOG_SCHEMA=<provider>_eventlog_schema.src; the
+# schema is fed through include/eventlog/eventlog_gen.awk to produce
+# <provider>_eventlog.h under ${OBJTOP}/sys/include/eventlog/. The
+# header is added to SRCS so depend / clean see it, and its parent
+# directory is added to -I so the kmod's sources can do
+# #include <eventlog/<provider>_eventlog.h>.
+.if !empty(EVENTLOG_SCHEMA)
+EVENTLOG_SCHEMA_PATH= ${SRCTOP}/include/eventlog/${EVENTLOG_SCHEMA}
+EVENTLOG_PROVIDER!= ${AWK} '/^PROVIDER/ {print tolower($$2); exit}' ${EVENTLOG_SCHEMA_PATH}
+EVENTLOG_HEADER_DIR= ${OBJTOP}/sys/include/eventlog
+EVENTLOG_HEADER_DIR:= ${EVENTLOG_HEADER_DIR:tA}
+EVENTLOG_HEADER= ${EVENTLOG_PROVIDER}_eventlog.h
+EVENTLOG_HEADER_PATH= ${EVENTLOG_HEADER_DIR}/${EVENTLOG_HEADER}
+SRCS+= ${EVENTLOG_HEADER_PATH}
+CLEANFILES+= ${EVENTLOG_HEADER_PATH}
+.if !make(clean) && !make(cleandir) && !make(clobber)
+_EVENTLOG_GENHDR!= mkdir -p ${EVENTLOG_HEADER_DIR}; \
+ if [ ! -f ${EVENTLOG_HEADER_PATH} ] || \
+ [ ${EVENTLOG_SCHEMA_PATH} -nt ${EVENTLOG_HEADER_PATH} ] || \
+ [ ${SRCTOP}/include/eventlog/eventlog_gen.awk -nt ${EVENTLOG_HEADER_PATH} ]; then \
+ cd ${SRCTOP} && ${AWK} -v outdir=${EVENTLOG_HEADER_DIR} -f include/eventlog/eventlog_gen.awk ${EVENTLOG_SCHEMA_PATH} -h; \
+ fi; echo done
+.endif
+CFLAGS+= -I${EVENTLOG_HEADER_DIR:H}
+${EVENTLOG_HEADER_PATH}: ${SRCTOP}/include/eventlog/eventlog_gen.awk ${EVENTLOG_SCHEMA_PATH}
+ @mkdir -p ${EVENTLOG_HEADER_DIR}
+ @cd ${SRCTOP} && ${AWK} -v outdir=${EVENTLOG_HEADER_DIR} -f include/eventlog/eventlog_gen.awk ${EVENTLOG_SCHEMA_PATH} -h
+beforedepend: ${EVENTLOG_HEADER_PATH}
+.endif
+
.if !empty(SRCS:Mbhnd_nvram_map_data.h)
CLEANFILES+= bhnd_nvram_map_data.h
bhnd_nvram_map_data.h: ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.awk \
diff --git a/sys/kern/kern_eventlog.c b/sys/kern/kern_eventlog.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_eventlog.c
@@ -0,0 +1,2630 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * MEMORY ACCESS AND SYNCHRONIZATION MODEL
+ * =======================================
+ * Per-CPU double-buffering: Two buffers per CPU. Writers use "active" buffer;
+ * readers use "reader" buffer (1 - active). Swap when reader is empty and
+ * active has data.
+ *
+ * Invariant: There is NEVER partial data in either buffer. Each buffer
+ * contains zero or more complete events (header + payload).
+ *
+ * Per-CPU writer concurrency: All write paths enter via smr_enter() which
+ * calls critical_enter(), disabling thread preemption. Thread-level
+ * writer-vs-writer contention is therefore impossible. Hardware NMIs are
+ * NOT blocked by critical sections, however, so an NMI-context writer can
+ * nest inside an in-progress thread-level writer on the same CPU. The
+ * protocol tolerates this: every state-changing step (try_swap, commit-CAS)
+ * re-derives active and commit_pos from the post-CAS state and re-checks
+ * capacity, so an NMI's intervening commit (with or without a buffer swap)
+ * is preserved.
+ *
+ * Packed state: reader_len (30 bits), commit_pos (30 bits), active_buf, and
+ * swap_allowed are packed into a single 64-bit word (packed_state). A
+ * single CAS atomically publishes any state transition (writer commit,
+ * buffer swap, reader drain).
+ *
+ * Writer:
+ * (1) Load packed_state to get commit_pos and active_buf.
+ * (2) Check capacity: if commit_pos + event_len > buffer_size, attempt a
+ * proactive swap if swap_allowed. After try_swap, re-derive active
+ * and commit_pos from the post-CAS state and re-check capacity (an
+ * NMI on this CPU may have already swapped or partially filled the
+ * new active buffer). Drops only if swap is not allowed (reader
+ * still draining) or no room remains after the swap.
+ * (3) Write: memcpy event data to buffer at commit_pos offset.
+ * (4) Commit: CAS packed_state to advance commit_pos. On CAS failure,
+ * re-derive active and commit_pos from the updated state. If either
+ * changed (peer reader swap, NMI commit, or NMI swap-and-commit),
+ * redo the write at the new offset; otherwise just recompute the
+ * desired packed_state value and retry the CAS.
+ *
+ * Reader: Single reader only. Reads from reader buffer. No lock needed for
+ * reads. Advances read_pos by full event lengths. When fully drained, zeros
+ * read_pos/reader_len then eagerly sets swap_allowed (giving writers the
+ * earliest possible permission to proactively swap on buffer-full).
+ *
+ * Swap publication:
+ * reader_len is packed into the upper 32 bits of packed_state and the
+ * swap is a single transition that flips active_buf, zeros commit_pos,
+ * clears swap_allowed, and publishes reader_len = old commit_pos. Two
+ * concurrent try_swap callers cannot clobber each other: exactly one
+ * wins; the loser sees the post-swap state and reader_len = winner's
+ * commit_pos.
+ *
+ * On targets where the MI atomic_*_64 API is available (__LP64__, i.e.
+ * every 64-bit FreeBSD architecture) the swap is a single
+ * atomic_fcmpset_64; the path is lock-free and NMI-safe by construction.
+ *
+ * On 32-bit targets that do not provide atomic_*_64 (FreeBSD's MI
+ * atomic_load_64 is itself gated on __LP64__),
+ * the same 64-bit packed_state is used but every state operation (load,
+ * commit, swap, drain) takes a per-pcpu_buf MTX_SPIN that serialises
+ * access to the otherwise non-atomic 64-bit field. No atomics are
+ * needed inside the helpers; the lock provides both serialisation and
+ * visibility. NMI-safety is provided up-front: the writer entry point
+ * checks mtx_owned(&pcpu_buf->swap_lock) and drops the event if true
+ * (in NMI context curthread is the interrupted thread, so mtx_owned()
+ * being true means we would mtx_lock_spin against ourselves and
+ * deadlock). No caller-visible flag or per-helper trylock is needed.
+ *
+ * Key properties (both implementations):
+ * - Writers NEVER spin waiting for the reader. They perform the swap
+ * themselves or drop if swap is not allowed.
+ * - No critical_enter beyond the one implied by smr_enter() is needed:
+ *   the writer's commit CAS detects and handles concurrent reader swaps
+ *   by retrying.
+ * - Writer can proactively swap on buffer-full when swap_allowed is set,
+ * reducing event drops.
+ * - swap_allowed=0 implies reader_len > 0 (try_swap is only invoked with
+ * commit_pos > 0, and the swap publishes reader_len = commit_pos).
+ */
+
+#define EVENTLOG_INTERNAL
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/jail.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/ck.h>
+#include <sys/smr.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/counter.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/eventlog.h>
+#include <sys/eventlog_subscriber.h>
+#include <sys/smp.h>
+#include <sys/time.h>
+#include <sys/limits.h>
+#include <machine/cpu.h>
+#include <machine/atomic.h>
+#include <sys/conf.h>
+#include <fs/devfs/devfs.h>
+#include <sys/fcntl.h>
+#include <sys/uio.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/time.h>
+#include <vm/vm.h>
+#include <vm/uma.h>
+
+/* Used to disable inlining to help debug performance issues via flamegraphs. */
+#define EVENTLOG_INLINING //__noinline
+
+MALLOC_DEFINE(M_EVENTLOG, "eventlog", "eventlog subsystem");
+
+/*
+ * Full definition of eventlog_session - private to this file; header
+ * has partial/forward only.
+ */
+struct eventlog_session {
+ /*
+ * The generated *_ENABLED() macros read these two fields directly,
+ * with no function call and no lock (see eventlog(9)).
+ * NOTE(review): presumably they must stay first so the layout matches
+ * the partial definition in the header - confirm.
+ */
+ enum eventlog_level effective_level;
+ uint32_t effective_keywords;
+ /* Private fields - only visible in this file */
+ struct eventlog_provider *provider;
+ /* presumably linkage on provider->sessions - confirm */
+ LIST_ENTRY(eventlog_session) link;
+ uint64_t session_id; /* Unique id (e.g., inp_gencnt for TCP) */
+ uint64_t created_at; /* us since boot when session was created */
+ /*
+ * NOTE(review): looks like a per-session level/keyword override that
+ * applies when has_override is set - confirm against the setters.
+ */
+ enum eventlog_level override_level;
+ uint32_t override_keywords;
+ uint8_t disabled;
+ uint8_t has_override;
+};
+
+/*
+ * Shared statistics for all providers with the same name.
+ * Reference-counted: created on first provider, freed when last is destroyed.
+ * Protected by evl.providers_lock.
+ */
+struct eventlog_provider_stats {
+ /* Reference count; the entry is freed when the last provider goes. */
+ int refcount;
+ /*
+ * presumably the value reported by eventlog_provider_get_default()
+ * (0: sessions start disabled, 1: sessions start enabled) - confirm
+ */
+ int default_enabled;
+ LIST_ENTRY(eventlog_provider_stats) link;
+ /* Session counters shared by all providers with this name. */
+ counter_u64_t sessions_created;
+ counter_u64_t sessions_active;
+ counter_u64_t sessions_enabled;
+ /* Context backing sysctl_node below. */
+ struct sysctl_ctx_list sysctl_ctx;
+ /*
+ * kern.eventlog.<name>; exposed to providers via
+ * eventlog_provider_get_sysctl_node().
+ */
+ struct sysctl_oid *sysctl_node;
+ char name[EVENTLOG_PROVIDER_NAME_MAX];
+};
+
+/* Full definition of eventlog_provider */
+struct eventlog_provider {
+ struct mtx sessions_lock;
+ LIST_HEAD(, eventlog_session) sessions;
+ LIST_ENTRY(eventlog_provider) link;
+ /* Statistics shared with every other provider of the same name. */
+ struct eventlog_provider_stats *stats;
+ /* Optional provider callbacks, each paired with its opaque argument. */
+ eventlog_provider_dump_state_t dump_callback;
+ void *dump_callback_arg;
+ eventlog_default_changed_t default_changed;
+ void *default_changed_arg;
+ eventlog_subscribers_changed_t subscribers_changed;
+ void *subscribers_changed_arg;
+ /*
+ * presumably the aggregate level and keyword mask returned by
+ * eventlog_provider_get_level() / eventlog_provider_get_keywords()
+ * - confirm
+ */
+ enum eventlog_level level;
+ uint32_t keywords;
+ bool has_subscribers; /* tracked under sessions_lock */
+ uint16_t provider_id; /* Unique ID assigned on registration */
+ uint8_t name_len; /* excluding null terminator */
+ char name[EVENTLOG_PROVIDER_NAME_MAX];
+};
+
+/*
+ * Full definition of eventlog_subscription. CK_SLIST for lock-free traversal
+ * in SMR read path.
+ */
+struct eventlog_subscription {
+ CK_SLIST_ENTRY(eventlog_subscription) link;
+ struct eventlog_provider *provider;
+ /*
+ * presumably the level and keyword filter this subscriber requested
+ * for the provider - confirm
+ */
+ enum eventlog_level level;
+ uint32_t keywords;
+};
+
+/*
+ * Per-CPU buffer structure for double-buffering. See "MEMORY ACCESS AND
+ * SYNCHRONIZATION MODEL" at the top of this file for the protocol; this
+ * block documents only the data layout.
+ *
+ * packed_state layout:
+ * [63:32] reader_len - bytes in reader buffer (set at swap)
+ * [31:2] commit_pos - bytes committed to active buffer (= write cursor)
+ * [1] swap_allowed - reader buffer is empty, writer may proactively swap
+ * [0] active_buf - which buffer (0 or 1) is the active writer buffer
+ *
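+ * commit_pos is a 30-bit field; reader_len occupies a 32-bit field but only
+ * ever holds a former commit_pos value. Both therefore support buffers up
+ * to 1 GB (the enforced maximum). Initialised with swap_allowed=1. Because
+ * commit_pos lives in [31:2], a writer commit can simply add
+ * (event_len << 2) to packed_state without disturbing the upper bits
+ * (commit_pos + event_len <= buffer_size <= 1 GB rules out overflow).
+ * NOTE(review): a 30-bit field tops out at 2^30 - 1, so the no-overflow
+ * argument needs buffer_size < 2^30 (or a buffer that can never fill to
+ * exactly 2^30 bytes); confirm the size check enforces this.
+ * The SMR critical section pins the writer to one CPU, so commit_pos
+ * also serves as the writer's reservation cursor.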
+ *
+ * EVENTLOG_FORCE_SWAP_LOCK overrides the LP64 detection so the fallback
+ * path can be compile- and run-tested on 64-bit hosts.
+ */
+/*
+ * Defined when packed_state is updated with atomic_*_64; when it is not
+ * defined, the swap_lock spin-mutex fallback is compiled in instead.
+ */
+#if defined(__LP64__) && !defined(EVENTLOG_FORCE_SWAP_LOCK)
+#define EVENTLOG_HAS_ATOMIC64 1
+#endif
+
+/* Bit 0: index of the active (writer) buffer. */
+#define EVTLOG_ACTIVE_BUF 0x1U
+/* Bit 1: reader buffer is empty, writers may swap. */
+#define EVTLOG_SWAP_ALLOWED 0x2U
+/* commit_pos starts at bit 2 of packed_state. */
+#define EVTLOG_COMMIT_SHIFT 2
+
+/* reader_len occupies bits [63:32] of packed_state. */
+#define EVTLOG_READER_LEN_SHIFT 32
+/* Place a 32-bit length into the reader_len field. */
+#define EVTLOG_PACK_READER_LEN(rl) \
+ (((uint64_t)(uint32_t)(rl)) << EVTLOG_READER_LEN_SHIFT)
+/* Mask covering every bit of the reader_len field. */
+#define EVTLOG_READER_LEN_MASK \
+ (((uint64_t)UINT32_MAX) << EVTLOG_READER_LEN_SHIFT)
+
+/*
+ * State for one CPU's pair of event buffers.
+ * NOTE(review): the cache-line alignment presumably keeps each CPU's
+ * state on its own line - confirm.
+ */
+struct eventlog_percpu_buffer {
+ void *buffers[2]; /* Two buffers: [0] and [1] */
+ /* Capacity bound that commit_pos and reader_len are checked against. */
+ uint32_t buffer_size;
+ uint32_t read_pos; /* Read cursor in reader buffer */
+#ifndef EVENTLOG_HAS_ATOMIC64
+ struct mtx swap_lock; /* MTX_SPIN; covers all state ops */
+#endif
+ volatile uint64_t packed_state; /* See layout above */
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * Atomic state abstraction. evtlog_state_t carries the entire per-CPU
+ * buffer state observable by callers as a single uint64_t with the layout
+ * documented above. Both implementations operate on the same word; only
+ * the synchronisation primitive (atomic_*_64 vs spin-mutex) differs.
+ */
+typedef uint64_t evtlog_state_t;
+
+/* Index (0 or 1) of the active writer buffer encoded in a state word. */
+static inline int
+evtlog_state_active(evtlog_state_t s)
+{
+	const evtlog_state_t active_bit = s & EVTLOG_ACTIVE_BUF;
+
+	return ((int)active_bit);
+}
+
+/* Bytes committed to the active buffer, taken from bits [31:2]. */
+static inline uint32_t
+evtlog_state_commit_pos(evtlog_state_t s)
+{
+	const uint32_t low_word = (uint32_t)s;
+
+	return (low_word >> EVTLOG_COMMIT_SHIFT);
+}
+
+/* True when the swap_allowed bit is set in a state word. */
+static inline bool
+evtlog_state_swap_allowed(evtlog_state_t s)
+{
+	if ((s & EVTLOG_SWAP_ALLOWED) == 0)
+		return (false);
+	return (true);
+}
+
+/* Bytes published to the reader buffer, taken from bits [63:32]. */
+static inline uint32_t
+evtlog_state_reader_len(evtlog_state_t s)
+{
+	const evtlog_state_t upper = s >> EVTLOG_READER_LEN_SHIFT;
+
+	return ((uint32_t)upper);
+}
+
+/*
+ * Take a consistent snapshot of packed_state: an acquire load where
+ * 64-bit atomics are available, otherwise a plain read under swap_lock.
+ */
+static inline evtlog_state_t
+evtlog_load_state(struct eventlog_percpu_buffer *pcpu)
+{
+	evtlog_state_t snapshot;
+
+#ifdef EVENTLOG_HAS_ATOMIC64
+	snapshot = atomic_load_acq_64(&pcpu->packed_state);
+#else
+	mtx_lock_spin(&pcpu->swap_lock);
+	snapshot = pcpu->packed_state;
+	mtx_unlock_spin(&pcpu->swap_lock);
+#endif
+	return (snapshot);
+}
+
+/*
+ * Publish a write of event_len bytes by bumping commit_pos in one step.
+ *
+ * *state is the packed state the caller based its write on. Returns true
+ * if packed_state still matched and was advanced. Returns false
+ * otherwise, with *state refreshed to the current packed state so the
+ * caller can re-derive active and commit_pos and decide whether the
+ * write has to be redone.
+ */
+static inline bool
+evtlog_try_commit(struct eventlog_percpu_buffer *pcpu,
+    evtlog_state_t *state, uint32_t event_len)
+{
+	const evtlog_state_t delta =
+	    (evtlog_state_t)event_len << EVTLOG_COMMIT_SHIFT;
+	const evtlog_state_t wanted = *state + delta;
+#ifdef EVENTLOG_HAS_ATOMIC64
+
+	return (atomic_fcmpset_64(&pcpu->packed_state, state, wanted));
+#else
+	bool committed;
+
+	mtx_lock_spin(&pcpu->swap_lock);
+	committed = (pcpu->packed_state == *state);
+	if (committed)
+		pcpu->packed_state = wanted;
+	/* Either the value just stored or the one that beat us. */
+	*state = pcpu->packed_state;
+	mtx_unlock_spin(&pcpu->swap_lock);
+	return (committed);
+#endif
+}
+
+/*
+ * Attempt one atomic buffer swap. The protocol and the per-implementation
+ * synchronisation primitive are described under "Swap publication" in the
+ * SYNC MODEL at the top of this file.
+ *
+ * Success: returns true and *old_state holds the post-swap state
+ * (active_buf flipped, commit_pos=0, swap_allowed clear, reader_len =
+ * pre-swap commit_pos).
+ * Failure: returns false and *old_state holds the latest observed packed
+ * state, so the caller can re-check capacity after a peer swap.
+ *
+ * Precondition: commit_pos > 0 in *old_state.
+ */
+static inline bool
+evtlog_try_swap(struct eventlog_percpu_buffer *pcpu,
+    evtlog_state_t *old_state)
+{
+	const evtlog_state_t expected = *old_state;
+	const uint32_t commit = evtlog_state_commit_pos(expected);
+	evtlog_state_t swapped;
+	bool won;
+
+	MPASS(commit > 0);
+	/* Everything except the flipped buffer bit and reader_len is zero. */
+	swapped = EVTLOG_PACK_READER_LEN(commit) |
+	    ((expected & EVTLOG_ACTIVE_BUF) ^ EVTLOG_ACTIVE_BUF);
+
+#ifdef EVENTLOG_HAS_ATOMIC64
+	won = atomic_fcmpset_64(&pcpu->packed_state, old_state, swapped);
+#else
+	mtx_lock_spin(&pcpu->swap_lock);
+	won = (pcpu->packed_state == *old_state);
+	if (won)
+		pcpu->packed_state = swapped;
+	else
+		*old_state = pcpu->packed_state;
+	mtx_unlock_spin(&pcpu->swap_lock);
+#endif
+	if (won)
+		*old_state = swapped;
+	return (won);
+}
+
+/*
+ * Declare the reader buffer empty: reset read_pos, clear reader_len and
+ * set swap_allowed. The caller must have just consumed every byte of the
+ * reader buffer (read_pos == reader_len). It runs in the single reader
+ * thread (never NMI), so blocking on the swap lock in the fallback path
+ * is safe.
+ */
+static inline void
+evtlog_drain_complete(struct eventlog_percpu_buffer *pcpu_buf)
+{
+#ifdef EVENTLOG_HAS_ATOMIC64
+	uint64_t seen, drained;
+
+	pcpu_buf->read_pos = 0;
+	seen = atomic_load_acq_64(&pcpu_buf->packed_state);
+	for (;;) {
+		/* A failed fcmpset refreshes seen; recompute from it. */
+		drained = (seen & ~EVTLOG_READER_LEN_MASK) |
+		    EVTLOG_SWAP_ALLOWED;
+		if (atomic_fcmpset_64(&pcpu_buf->packed_state, &seen,
+		    drained))
+			break;
+	}
+#else
+	uint64_t drained;
+
+	mtx_lock_spin(&pcpu_buf->swap_lock);
+	pcpu_buf->read_pos = 0;
+	drained = pcpu_buf->packed_state & ~EVTLOG_READER_LEN_MASK;
+	pcpu_buf->packed_state = drained | EVTLOG_SWAP_ALLOWED;
+	mtx_unlock_spin(&pcpu_buf->swap_lock);
+#endif
+}
+
+/*
+ * Validate that a buffer contains only complete events (no partial data).
+ * buffer: pointer to buffer, buffer_size: capacity, start: offset to begin,
+ * written_len: bytes of data. Call with __LINE__ for panic diagnostics.
+ */
+#ifdef INVARIANTS
+/*
+ * Walk [start, written_len) one event at a time and KASSERT that every
+ * header and every event lies entirely inside the written region, and
+ * that the walk ends exactly at written_len. The line argument is used
+ * only in the panic messages.
+ */
+static inline void
+eventlog_validate_buffer(void *buffer, size_t buffer_size, size_t start,
+ size_t written_len, int line)
+{
+ size_t offset = start;
+ struct eventlog_event_header hdr;
+
+ KASSERT(start <= written_len,
+ ("%s: start %zu > written_len %zu (caller line %d)",
+ __func__, start, written_len, line));
+ KASSERT(written_len <= buffer_size,
+ ("%s: written_len %zu > buffer_size %zu (caller line %d)",
+ __func__, written_len, buffer_size, line));
+ /* An empty region has nothing to walk. */
+ if (written_len == 0)
+ return;
+ KASSERT(written_len >= sizeof(struct eventlog_event_header),
+ ("%s: partial data, written_len %zu < header (line %d)",
+ __func__, written_len, line));
+ while (offset < written_len) {
+ KASSERT(
+ offset + sizeof(struct eventlog_event_header) <=
+ written_len,
+ ("%s: truncated header at offset %zu (line %d)",
+ __func__, offset, line));
+ /*
+ * The header is copied out rather than read in place.
+ * NOTE(review): presumably because packed events need not be
+ * naturally aligned - confirm.
+ */
+ memcpy(&hdr, (const uint8_t *)buffer + offset,
+ sizeof(struct eventlog_event_header));
+ KASSERT(hdr.event_length >=
+ sizeof(struct eventlog_event_header),
+ ("%s: invalid event_length %u at offset %zu (line %d)",
+ __func__, hdr.event_length, offset, line));
+ KASSERT(offset + hdr.event_length <= written_len,
+ ("%s: event overrun at offset %zu len %u (line %d)",
+ __func__, offset, hdr.event_length, line));
+ /* event_length covers header plus payload; step to the next. */
+ offset += hdr.event_length;
+ }
+ KASSERT(offset == written_len,
+ ("%s: partial event at end, offset %zu != written_len %zu"
+ " (caller line %d)",
+ __func__, offset, written_len, line));
+}
+
+/*
+ * Validate the reader buffer (the non-active one) from read_pos up to
+ * reader_len, both taken from a single snapshot of packed_state.
+ */
+#define EVENTLOG_VALIDATE_READER(pcpu_buf) do { \
+ evtlog_state_t _vs = evtlog_load_state(pcpu_buf); \
+ eventlog_validate_buffer( \
+ (pcpu_buf)->buffers[1 - evtlog_state_active(_vs)], \
+ (pcpu_buf)->buffer_size, (pcpu_buf)->read_pos, \
+ evtlog_state_reader_len(_vs), __LINE__); \
+} while (0)
+/*
+ * Validate the active buffer from offset 0 up to commit_pos, both taken
+ * from a single snapshot of packed_state.
+ */
+#define EVENTLOG_VALIDATE_WRITER(pcpu_buf) do { \
+ evtlog_state_t _vs = evtlog_load_state(pcpu_buf); \
+ eventlog_validate_buffer( \
+ (pcpu_buf)->buffers[evtlog_state_active(_vs)], \
+ (pcpu_buf)->buffer_size, 0, \
+ evtlog_state_commit_pos(_vs), __LINE__); \
+} while (0)
+#else
+/* Without INVARIANTS both checks expand to nothing. */
+#define EVENTLOG_VALIDATE_READER(pcpu_buf) do { } while (0)
+#define EVENTLOG_VALIDATE_WRITER(pcpu_buf) do { } while (0)
+#endif
+
+/*
+ * Return the timestamp field of the event header that starts at buf.
+ *
+ * Events are packed back-to-back with no padding, so buf is not
+ * guaranteed to be aligned for a uint64_t load. Copy the field out with
+ * memcpy(), as eventlog_validate_buffer() does for the whole header,
+ * instead of dereferencing a casted pointer.
+ */
+static inline uint64_t
+eventlog_read_timestamp(const void *buf)
+{
+	uint64_t ts;
+
+	memcpy(&ts, (const uint8_t *)buf +
+	    offsetof(struct eventlog_event_header, timestamp), sizeof(ts));
+	return (ts);
+}
+
+/*
+ * Return the timestamp of the event at read_pos in this CPU's reader
+ * buffer. The read position is left untouched.
+ */
+static EVENTLOG_INLINING uint64_t
+eventlog_peek_next_timestamp(struct eventlog_percpu_buffer *pcpu_buf)
+{
+	const evtlog_state_t snap = evtlog_load_state(pcpu_buf);
+	const int reader = 1 - evtlog_state_active(snap);
+	const uint8_t *base;
+
+	EVENTLOG_VALIDATE_READER(pcpu_buf);
+	base = (const uint8_t *)pcpu_buf->buffers[reader];
+	return (eventlog_read_timestamp(base + pcpu_buf->read_pos));
+}
+
+/* Sentinel for timestamp: no next-event timestamp. */
+#define EVENTLOG_TIMESTAMP_NONE UINT64_MAX
+/* CPU already checked by resweep during this read (skip next time). */
+#define EVENTLOG_TIMESTAMP_SWEPT (UINT64_MAX - 1)
+
+/* Full definition of eventlog_subscriber (internal only) */
+CK_LIST_HEAD(eventlog_subscriber_head, eventlog_subscriber);
+struct eventlog_subscriber {
+ CK_LIST_ENTRY(eventlog_subscriber) link;
+ /* CK_SLIST of subscriptions; traversed lock-free in the SMR read path. */
+ CK_SLIST_HEAD(, eventlog_subscription) subscriptions;
+ /* presumably selects which member of u is valid - confirm */
+ enum eventlog_subscriber_type type;
+
+ union {
+ /* Device-based subscriber: per-CPU buffers */
+ struct {
+ struct eventlog_percpu_buffer *percpu_buffers;
+ uint32_t buffer_size_per_cpu;
+ /* Atomic: non-zero if reader is waiting. */
+ volatile uint32_t reader_waiting;
+ /* [maxcpu] next-event timestamp per CPU. */
+ uint64_t *cpu_timestamps;
+ /* Min-heap of CPU indices by timestamp. */
+ uint16_t *heap_cpus;
+ uint16_t heap_size; /* Number of CPUs in heap */
+ } device;
+ /* Callback-based subscriber: callback function */
+ struct {
+ eventlog_callback_t callback;
+ void *callback_arg;
+ } callback;
+ } u;
+
+ /*
+ * Async dump_state coordination. dump_pending counts queued +
+ * in-flight dump tasks targeting this subscriber; destroy/drain
+ * waits on the cv until it hits zero. The mtx covers both fields.
+ */
+ struct mtx dump_pending_mtx;
+ struct cv dump_pending_cv;
+ u_int dump_pending;
+
+ /* Statistics */
+ /*
+ * Events that could not be stored, e.g. when the active buffer was
+ * full and a swap was not allowed. Reported to userland by
+ * EVENTLOG_IOCTL_GET_STATS.
+ */
+ volatile u_long dropped_events;
+};
+
+/*
+ * Min-heap of CPU indices ordered by next-event timestamp.
+ * heap_cpus[0] is the CPU with minimum timestamp when heap_size > 0.
+ * Stored as implicit binary heap: parent at i, children at 2*i+1 and 2*i+2.
+ */
+
+/*
+ * Record `timestamp` as the next-event timestamp of `cpu` and add `cpu`
+ * to the min-heap.  O(log n).
+ */
+static EVENTLOG_INLINING void
+eventlog_heap_insert(struct eventlog_subscriber *subscriber, uint16_t cpu,
+    uint64_t timestamp)
+{
+	uint64_t *ts = subscriber->u.device.cpu_timestamps;
+	uint16_t *heap = subscriber->u.device.heap_cpus;
+	size_t hole, up;
+
+	ts[cpu] = timestamp;
+
+	/*
+	 * Open a hole at the tail and pull every ancestor with a larger
+	 * timestamp down into it; an empty heap simply skips the loop.
+	 */
+	hole = subscriber->u.device.heap_size++;
+	for (; hole > 0; hole = up) {
+		up = (hole - 1) / 2;
+		if (ts[heap[up]] <= ts[cpu])
+			break;
+		heap[hole] = heap[up];
+	}
+	heap[hole] = cpu;
+}
+
+/*
+ * Remove the root (the CPU with the minimum timestamp) from the heap and
+ * set that CPU's timestamp to EVENTLOG_TIMESTAMP_NONE.  Caller must ensure
+ * heap_size > 0.  O(log n).
+ *
+ * The tail element is re-placed by sifting a hole down from the root.
+ * Every level is compared against the timestamp of the element being
+ * re-placed, not against heap_cpus[i]: once a child has been moved up,
+ * slot i still holds that child, so comparing against it would end the
+ * sift after one level and could leave a larger timestamp above smaller
+ * descendants.
+ */
+static inline void
+eventlog_heap_extract_min(struct eventlog_subscriber *subscriber)
+{
+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
+	uint16_t *heap_cpus = subscriber->u.device.heap_cpus;
+	uint16_t *heap_size = &subscriber->u.device.heap_size;
+	uint64_t key;
+	uint16_t replaced;
+	size_t i, child;
+
+	MPASS(*heap_size > 0);
+
+	timestamps[heap_cpus[0]] = EVENTLOG_TIMESTAMP_NONE;
+
+	if (*heap_size == 1) {
+		*heap_size = 0;
+		return;
+	}
+
+	/* Detach the tail element; it is re-placed starting at the root. */
+	replaced = heap_cpus[--*heap_size];
+	key = timestamps[replaced];
+	i = 0;
+
+	/* Sift the hole down until `replaced` fits. */
+	for (;;) {
+		child = 2 * i + 1;
+		if (child >= *heap_size)
+			break;
+		/* Pick the smaller child; the left one wins ties. */
+		if (child + 1 < *heap_size &&
+		    timestamps[heap_cpus[child + 1]] <
+		    timestamps[heap_cpus[child]])
+			child++;
+		if (key <= timestamps[heap_cpus[child]])
+			break;
+		heap_cpus[i] = heap_cpus[child];
+		i = child;
+	}
+	heap_cpus[i] = replaced;
+}
+
+/*
+ * Give the root CPU a new timestamp (root key increased) and restore the
+ * heap property.  Replaces extract_min + heap_insert when only the root
+ * CPU needs updating.  Caller must ensure heap_size > 0.  O(log n).
+ *
+ * The root is sifted down as a hole.  Every level is compared against
+ * new_timestamp, not against heap_cpus[i]: once a child has been moved
+ * up, slot i still holds that child, so comparing against it would end
+ * the sift after one level and could leave the root CPU above
+ * descendants with smaller timestamps.
+ */
+static inline void
+eventlog_heap_update_root(struct eventlog_subscriber *subscriber,
+    uint64_t new_timestamp)
+{
+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
+	uint16_t *heap_cpus = subscriber->u.device.heap_cpus;
+	uint16_t *heap_size = &subscriber->u.device.heap_size;
+	uint16_t root_cpu;
+	size_t i, child;
+
+	MPASS(*heap_size > 0);
+
+	root_cpu = heap_cpus[0];
+	timestamps[root_cpu] = new_timestamp;
+	i = 0;
+
+	/* Sift the hole down until root_cpu fits. */
+	for (;;) {
+		child = 2 * i + 1;
+		if (child >= *heap_size)
+			break;
+		/* Pick the smaller child; the left one wins ties. */
+		if (child + 1 < *heap_size &&
+		    timestamps[heap_cpus[child + 1]] <
+		    timestamps[heap_cpus[child]])
+			child++;
+		if (new_timestamp <= timestamps[heap_cpus[child]])
+			break;
+		heap_cpus[i] = heap_cpus[child];
+		i = child;
+	}
+	heap_cpus[i] = root_cpu;
+}
+
+/*
+ * Return the second-smallest timestamp in the heap (used as the
+ * max_timestamp bound), or UINT64_MAX when the heap holds fewer than two
+ * elements.  The second minimum is always one of the root's children.
+ */
+static inline uint64_t
+eventlog_heap_second_min_timestamp(struct eventlog_subscriber *subscriber)
+{
+	const uint64_t *ts = subscriber->u.device.cpu_timestamps;
+	const uint16_t *heap = subscriber->u.device.heap_cpus;
+	const uint16_t count = subscriber->u.device.heap_size;
+	uint64_t second;
+
+	if (count < 2)
+		return (UINT64_MAX);
+
+	second = ts[heap[1]];
+	if (count > 2 && ts[heap[2]] < second)
+		second = ts[heap[2]];
+	return (second);
+}
+
+/*
+ * Global eventlog state structure. A single static instance (evl) holds
+ * the provider registry, the subscriber list and the dump machinery.
+ */
+struct eventlog_state {
+ /* Provider registry */
+ LIST_HEAD(, eventlog_provider) providers;
+ /* Per-name statistics records shared by same-named providers. */
+ LIST_HEAD(, eventlog_provider_stats) provider_stats;
+ struct mtx providers_lock; /* Protects providers/stats lists */
+ uint16_t next_provider_id; /* Next ID to assign (1-based) */
+
+ /* System-wide device */
+ struct cdev *device;
+ smr_t smr; /* SMR domain for subscriber iter. */
+ struct mtx subscribers_mtx; /* Writer-writer add/remove excl. */
+ struct eventlog_subscriber_head subscribers;
+
+ /* UMA zones */
+ uma_zone_t session_zone; /* struct eventlog_session allocations */
+
+ /*
+ * Dump state. dump_tq is single-threaded so dump callbacks
+ * serialize naturally. While the TQ thread runs a callback it
+ * publishes (dump_thread, dump_target) so eventlog_event_write_impl
+ * can route the callback's events to just the requesting subscriber.
+ * No lock is held: only the TQ thread reads its own publication
+ * (curthread == dump_thread); the destroy barrier is
+ * taskqueue_drain_all() in eventlog_provider_destroy().
+ */
+ struct thread *dump_thread; /* Thread running dump callback */
+ /* Subscriber receiving dump events. */
+ struct eventlog_subscriber *dump_target;
+ struct taskqueue *dump_tq;
+};
+
+/*
+ * Single instance of global eventlog state. Only the list heads are
+ * initialized statically; the mutexes, SMR domain, taskqueue and UMA
+ * zone are set up by the SYSINIT handlers in this file.
+ */
+static struct eventlog_state evl = {
+ .providers = LIST_HEAD_INITIALIZER(evl.providers),
+ .provider_stats = LIST_HEAD_INITIALIZER(evl.provider_stats),
+ .device = NULL,
+ .subscribers = CK_LIST_HEAD_INITIALIZER(evl.subscribers),
+};
+
+/*
+ * SYSINIT hook: set up the two global mutexes and the SMR domain that
+ * protects subscriber-list iteration.
+ */
+static void
+eventlog_state_init(void *unused)
+{
+	mtx_init(&evl.providers_lock, "eventlog providers", NULL, MTX_DEF);
+	mtx_init(&evl.subscribers_mtx, "eventlog subscribers", NULL, MTX_DEF);
+	evl.smr = smr_create("eventlog", 0, 0);
+}
+SYSINIT(eventlog_state_init, SI_SUB_LOCK, SI_ORDER_ANY, eventlog_state_init,
+    NULL);
+
+/*
+ * SYSINIT hook: create the dump taskqueue and give it exactly one worker
+ * thread.  With a single thread dump callbacks never overlap, which is
+ * what lets the (dump_thread, dump_target) publication stay lock-free.
+ * Panics if the worker thread cannot be started.
+ */
+static void
+eventlog_dump_tq_init(void *unused)
+{
+	int error;
+
+	evl.dump_tq = taskqueue_create("eventlog_dump", M_WAITOK,
+	    taskqueue_thread_enqueue, &evl.dump_tq);
+
+	error = taskqueue_start_threads(&evl.dump_tq, 1, PWAIT,
+	    "eventlog_dump taskq");
+	if (error == 0)
+		return;
+
+	panic("eventlog: taskqueue_start_threads failed: %d", error);
+}
+SYSINIT(eventlog_dump_tq_init, SI_SUB_TASKQ, SI_ORDER_SECOND,
+    eventlog_dump_tq_init, NULL);
+
+/*
+ * SYSINIT hook: create the UMA zone that backs struct eventlog_session
+ * allocations (pointer-aligned, no ctor/dtor/init/fini callbacks).
+ */
+static void
+eventlog_session_zone_init(void *unused)
+{
+	uma_zone_t zone;
+
+	zone = uma_zcreate("eventlog_session", sizeof(struct eventlog_session),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	evl.session_zone = zone;
+}
+SYSINIT(eventlog_session_zone, SI_SUB_KMEM, SI_ORDER_ANY,
+    eventlog_session_zone_init, NULL);
+
+/* Forward declarations */
+static void eventlog_session_update_effective(struct eventlog_session *session,
+ struct eventlog_provider *provider);
+static void eventlog_update_provider_enablement(
+ struct eventlog_provider *provider);
+static void eventlog_subscriber_write_event(
+ struct eventlog_subscriber *subscriber,
+ struct eventlog_session *session, struct eventlog_event_header *hdr,
+ const struct iovec *iov, int iovcnt, size_t payload_size,
+ uint16_t event_length, enum eventlog_level level, uint32_t keywords);
+static void eventlog_copy_events_from_cpu(
+ struct eventlog_subscriber *subscriber,
+ struct eventlog_percpu_buffer *pcpu_buf, struct uio *uio,
+ uint64_t max_timestamp, uint64_t *next_timestamp_out,
+ bool *uio_out_of_space_out);
+static void eventlog_read_merged(struct eventlog_subscriber *subscriber,
+ struct uio *uio, uint64_t read_timestamp);
+static void eventlog_resweep_idle_cpus(struct eventlog_subscriber *subscriber,
+ uint64_t read_timestamp);
+
+/* Kernel sysctl node definitions */
+SYSCTL_DECL(_kern_eventlog);
+SYSCTL_NODE(_kern, OID_AUTO, eventlog, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+ "Event log subsystem");
+
+/*
+ * Find existing shared statistics for a provider name and take a
+ * reference on them.
+ * Caller must hold evl.providers_lock.
+ * On a match the record's refcount is incremented before it is returned,
+ * so every successful lookup must eventually be balanced by
+ * eventlog_provider_stats_release().
+ * Returns NULL (and takes no reference) if no stats exist for this name.
+ */
+static struct eventlog_provider_stats *
+eventlog_provider_stats_find(const char *name)
+{
+ struct eventlog_provider_stats *stats;
+
+ LIST_FOREACH(stats, &evl.provider_stats, link) {
+ if (strcmp(stats->name, name) == 0) {
+ /* The caller becomes another holder of this record. */
+ stats->refcount++;
+ return (stats);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Force every session of one provider instance into the enabled
+ * (enabled != 0) or disabled (enabled == 0) state.  Sessions already in
+ * the requested state are left untouched; for the others the
+ * sessions_enabled counter is adjusted and the effective level/keywords
+ * are recomputed.  provider->sessions_lock is held across the whole walk.
+ */
+static void
+eventlog_provider_set_all_sessions(struct eventlog_provider *provider,
+    int enabled)
+{
+	struct eventlog_session *session;
+	const int want_disabled = (enabled == 0) ? 1 : 0;
+	const int delta = (enabled != 0) ? 1 : -1;
+
+	mtx_lock(&provider->sessions_lock);
+	LIST_FOREACH(session, &provider->sessions, link) {
+		if (session->disabled != want_disabled) {
+			counter_u64_add(provider->stats->sessions_enabled,
+			    delta);
+			session->disabled = want_disabled;
+			eventlog_session_update_effective(session, provider);
+		}
+	}
+	mtx_unlock(&provider->sessions_lock);
+}
+
+/*
+ * Sysctl handler for kern.eventlog.<name>.default.
+ *
+ * Reads return the current default_enabled value.  Writes accept:
+ *    0  new sessions start disabled
+ *    1  new sessions start enabled
+ *   -1  store 0 and also disable all currently active sessions
+ *    2  store 1 and also enable all currently disabled sessions
+ * Any other value is rejected with EINVAL.
+ *
+ * After the default is stored, every provider sharing this stats record
+ * is notified: its default_changed callback receives the raw value
+ * written, or, if it has no callback, -1/2 are applied to its sessions
+ * directly.
+ *
+ * NOTE(review): at most nitems(matched) providers are collected; any
+ * further providers sharing the record are silently not notified.
+ * NOTE(review): the collected provider pointers are used after
+ * evl.providers_lock is dropped; confirm what keeps them alive against a
+ * concurrent eventlog_provider_destroy().
+ */
+static int
+sysctl_eventlog_default(SYSCTL_HANDLER_ARGS)
+{
+	struct eventlog_provider_stats *stats = arg1;
+	struct eventlog_provider *matched[16];
+	struct eventlog_provider *p;
+	int error, i, nmatched, val;
+
+	val = stats->default_enabled;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Valid requests are -1, 0, 1 and 2; -1/0 store 0, 1/2 store 1. */
+	if (val < -1 || val > 2)
+		return (EINVAL);
+	stats->default_enabled = (val >= 1) ? 1 : 0;
+
+	/* Snapshot the providers sharing this record under the lock. */
+	nmatched = 0;
+	mtx_lock(&evl.providers_lock);
+	LIST_FOREACH(p, &evl.providers, link) {
+		if (p->stats != stats || nmatched >= (int)nitems(matched))
+			continue;
+		matched[nmatched++] = p;
+	}
+	mtx_unlock(&evl.providers_lock);
+
+	/* Notify (or directly update) each provider outside the lock. */
+	for (i = 0; i < nmatched; i++) {
+		p = matched[i];
+		if (p->default_changed != NULL) {
+			p->default_changed(p, val, p->default_changed_arg);
+		} else if (val == -1 || val == 2) {
+			eventlog_provider_set_all_sessions(p,
+			    (val == 2) ? 1 : 0);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate a new shared statistics structure. Does not insert into the
+ * global list — caller must do that under evl.providers_lock after
+ * re-checking for a concurrent creation. All sleeping allocations
+ * (malloc, counter_u64_alloc, sysctl) happen here, outside any lock.
+ *
+ * name: provider name; copied into the record and used as the name of
+ * the kern.eventlog.<name> sysctl node.
+ * default_enabled: initial default, possibly overridden by the
+ * kern.eventlog.<name>.default tunable.
+ *
+ * Returns the new record with refcount 1. Release it with
+ * eventlog_provider_stats_free().
+ */
+static struct eventlog_provider_stats *
+eventlog_provider_stats_alloc(const char *name, int default_enabled)
+{
+ struct eventlog_provider_stats *stats;
+ struct sysctl_oid *stats_node;
+ /*
+ * NOTE(review): snprintf() truncates silently if the composed tunable
+ * name does not fit; confirm 64 bytes covers the longest name that
+ * EVENTLOG_PROVIDER_NAME_MAX allows.
+ */
+ char tunable_name[64];
+
+ stats = malloc(sizeof(*stats), M_EVENTLOG, M_WAITOK | M_ZERO);
+ strlcpy(stats->name, name, EVENTLOG_PROVIDER_NAME_MAX);
+ stats->refcount = 1;
+ stats->default_enabled = default_enabled;
+ /*
+ * Apply the kern.eventlog.<name>.default tunable on top of the
+ * config default. TUNABLE_INT_FETCH leaves the field alone if the
+ * tunable is absent.
+ */
+ snprintf(tunable_name, sizeof(tunable_name),
+ "kern.eventlog.%s.default", name);
+ TUNABLE_INT_FETCH(tunable_name, &stats->default_enabled);
+ stats->sessions_created = counter_u64_alloc(M_WAITOK);
+ stats->sessions_active = counter_u64_alloc(M_WAITOK);
+ stats->sessions_enabled = counter_u64_alloc(M_WAITOK);
+
+ /*
+ * Everything below is registered through stats->sysctl_ctx so that
+ * sysctl_ctx_free() in eventlog_provider_stats_free() tears it down.
+ */
+ sysctl_ctx_init(&stats->sysctl_ctx);
+ stats_node = SYSCTL_ADD_NODE(&stats->sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_kern_eventlog), OID_AUTO, name,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "Event log provider statistics");
+ stats->sysctl_node = stats_node;
+ SYSCTL_ADD_COUNTER_U64(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
+ OID_AUTO, "sessions_created", CTLFLAG_RD, &stats->sessions_created,
+ "Total sessions ever created successfully");
+ SYSCTL_ADD_COUNTER_U64(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
+ OID_AUTO, "sessions_active", CTLFLAG_RD, &stats->sessions_active,
+ "Current active session count");
+ SYSCTL_ADD_COUNTER_U64(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
+ OID_AUTO, "sessions_enabled", CTLFLAG_RD, &stats->sessions_enabled,
+ "Active sessions that are not disabled");
+ /* Writable knob; the handler receives this record as arg1. */
+ SYSCTL_ADD_PROC(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
+ OID_AUTO, "default", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ stats, 0, sysctl_eventlog_default, "I",
+ "Default enabled: 0=disabled, 1=enabled, -1=disable all active, 2=enable all disabled");
+
+ return (stats);
+}
+
+/*
+ * Free a provider_stats record that is not on the global list: either
+ * one that was never inserted (a concurrent creator won the race) or one
+ * that eventlog_provider_stats_release() has already unlinked.
+ * Must be called without evl.providers_lock held.
+ */
+static void
+eventlog_provider_stats_free(struct eventlog_provider_stats *stats)
+{
+ /*
+ * Tear down the sysctl nodes first: they reference the counters and
+ * the record itself, which are freed below.
+ */
+ sysctl_ctx_free(&stats->sysctl_ctx);
+ counter_u64_free(stats->sessions_created);
+ counter_u64_free(stats->sessions_active);
+ counter_u64_free(stats->sessions_enabled);
+ free(stats, M_EVENTLOG);
+}
+
+/*
+ * Drop one reference to shared provider statistics.
+ * Caller must hold evl.providers_lock.
+ *
+ * When the last reference goes away the record is unlinked from the
+ * global list and handed back to the caller, who must pass it to
+ * eventlog_provider_stats_free() after dropping the lock.  While other
+ * references remain, NULL is returned and nothing is unlinked.
+ */
+static struct eventlog_provider_stats *
+eventlog_provider_stats_release(struct eventlog_provider_stats *stats)
+{
+	stats->refcount--;
+	if (stats->refcount > 0)
+		return (NULL);
+
+	/* Last holder: unlink here, free later outside the lock. */
+	LIST_REMOVE(stats, link);
+	return (stats);
+}
+
+/*
+ * Create and register a new eventlog provider.
+ *
+ * name:   provider name; must be non-NULL and shorter than
+ *         EVENTLOG_PROVIDER_NAME_MAX.
+ * config: optional callbacks and default-enabled seed; NULL selects an
+ *         all-zero configuration.
+ *
+ * Providers created with the same name share one eventlog_provider_stats
+ * record (and therefore one kern.eventlog.<name> sysctl subtree).  The
+ * first provider for a name seeds default_enabled; later ones reuse the
+ * existing record.
+ *
+ * All allocations use M_WAITOK, so this may sleep.  Returns the new
+ * provider, already linked on the global provider list.
+ *
+ * NOTE(review): provider_id comes from a 16-bit counter that skips 0.
+ * After wrap-around an id still owned by a live provider could be handed
+ * out again; confirm whether that can matter in practice.
+ */
+struct eventlog_provider*
+eventlog_provider_create(const char *name,
+    const struct eventlog_provider_config *config)
+{
+	static const struct eventlog_provider_config empty_config;
+	struct eventlog_provider *provider;
+	struct eventlog_provider_stats *new_stats = NULL;
+
+	MPASS(name != NULL);
+	MPASS(strlen(name) < EVENTLOG_PROVIDER_NAME_MAX);
+
+	if (config == NULL)
+		config = &empty_config;
+
+	/* Allocate provider structure */
+	provider = malloc(sizeof(*provider), M_EVENTLOG, M_WAITOK | M_ZERO);
+	strlcpy(provider->name, name, EVENTLOG_PROVIDER_NAME_MAX);
+	provider->name_len = strlen(provider->name);
+	provider->dump_callback = config->dump_callback;
+	provider->dump_callback_arg = config->dump_callback_arg;
+	provider->default_changed = config->default_changed;
+	provider->default_changed_arg = config->default_changed_arg;
+	provider->subscribers_changed = config->subscribers_changed;
+	provider->subscribers_changed_arg = config->subscribers_changed_arg;
+	mtx_init(&provider->sessions_lock, "eventlog sessions", NULL, MTX_DEF);
+	LIST_INIT(&provider->sessions);
+
+	/* Fast path: take a reference on existing stats for this name. */
+	mtx_lock(&evl.providers_lock);
+	provider->stats = eventlog_provider_stats_find(name);
+	if (provider->stats == NULL) {
+		/*
+		 * Slow path: the allocation sleeps, so do it unlocked and
+		 * then re-check for a concurrent creator.
+		 */
+		mtx_unlock(&evl.providers_lock);
+		new_stats = eventlog_provider_stats_alloc(name,
+		    config->default_enabled);
+		mtx_lock(&evl.providers_lock);
+
+		provider->stats = eventlog_provider_stats_find(name);
+		if (provider->stats == NULL) {
+			/* We won: publish our record and keep it. */
+			LIST_INSERT_HEAD(&evl.provider_stats, new_stats, link);
+			provider->stats = new_stats;
+			new_stats = NULL;
+		}
+		/*
+		 * Otherwise another thread created the record while we were
+		 * allocating; ours is freed below, once the lock has been
+		 * dropped for good, instead of bouncing the lock here.
+		 */
+	}
+
+	/* Assign unique provider_id (1-based; 0 reserved for invalid) */
+	if (evl.next_provider_id == 0)
+		evl.next_provider_id = 1;
+	provider->provider_id = evl.next_provider_id++;
+	LIST_INSERT_HEAD(&evl.providers, provider, link);
+	mtx_unlock(&evl.providers_lock);
+
+	/* Dispose of the unused record; freeing must happen unlocked. */
+	if (new_stats != NULL)
+		eventlog_provider_stats_free(new_stats);
+
+	return (provider);
+}
+
+/*
+ * Unregister and cleanup an eventlog provider.
+ *
+ * A NULL provider is ignored. All of the provider's sessions must have
+ * been destroyed already (asserted). Waits for the dump taskqueue to
+ * drain, so this may sleep.
+ */
+void
+eventlog_provider_destroy(struct eventlog_provider *provider)
+{
+ struct eventlog_provider_stats *dead_stats;
+
+ if (provider == NULL)
+ return;
+
+ MPASS(LIST_EMPTY(&provider->sessions));
+
+ /*
+ * Remove from the provider list first so no new subscription
+ * (and therefore no new dump task) can find us.
+ */
+ mtx_lock(&evl.providers_lock);
+ LIST_REMOVE(provider, link);
+ /* Non-NULL only if this provider held the last reference. */
+ dead_stats = eventlog_provider_stats_release(provider->stats);
+ mtx_unlock(&evl.providers_lock);
+
+ /*
+ * Drain the dump taskqueue: queued or in-flight tasks may still
+ * reference this provider.
+ */
+ taskqueue_drain_all(evl.dump_tq);
+
+ /* Stats teardown happens outside evl.providers_lock. */
+ if (dead_stats != NULL)
+ eventlog_provider_stats_free(dead_stats);
+
+ mtx_destroy(&provider->sessions_lock);
+ free(provider, M_EVENTLOG);
+}
+
+/*
+ * Create a new eventlog session.
+ * Initial enabled state is derived from the provider's default_enabled.
+ *
+ * provider: owning provider; NULL yields NULL.
+ * session_id: caller-chosen identifier stored in the session and stamped
+ * into each event header written for it.
+ * waitok: true to allocate with M_WAITOK, false for M_NOWAIT.
+ * create_payload, create_payload_size: payload of the SESSION_CREATE
+ * event, if one is emitted.
+ *
+ * Returns the new session, or NULL if provider is NULL or the
+ * allocation fails.
+ */
+struct eventlog_session*
+eventlog_session_create(struct eventlog_provider *provider,
+ uint64_t session_id, bool waitok,
+ void *create_payload, size_t create_payload_size)
+{
+ struct bintime bt;
+ struct eventlog_session *session;
+ bool enabled;
+
+ if (provider == NULL)
+ return (NULL);
+
+ session = uma_zalloc(evl.session_zone,
+ (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
+ if (session == NULL)
+ return (NULL);
+
+ enabled = (provider->stats->default_enabled != 0);
+
+ /* created_at doubles as the SESSION_CREATE event timestamp below. */
+ binuptime(&bt);
+ session->created_at = bintime2us(&bt);
+ session->provider = provider;
+ session->session_id = session_id;
+ session->disabled = enabled ? 0 : 1;
+
+ counter_u64_add(provider->stats->sessions_created, 1);
+ counter_u64_add(provider->stats->sessions_active, 1);
+ if (enabled)
+ counter_u64_add(provider->stats->sessions_enabled, 1);
+
+ /* Add session to provider's list */
+ mtx_lock(&provider->sessions_lock);
+ LIST_INSERT_HEAD(&provider->sessions, session, link);
+ eventlog_session_update_effective(session, provider);
+ mtx_unlock(&provider->sessions_lock);
+
+ /* Emit SESSION_CREATE only when enabled. */
+ if (enabled && provider->level != EVENTLOG_LEVEL_NONE) {
+ eventlog_event_write_at(session, EVENTLOG_SESSION_CREATE_ID,
+ EVENTLOG_LEVEL_INFO, EVENTLOG_KEYWORD_SESSION,
+ create_payload, create_payload_size,
+ session->created_at);
+ }
+
+ return (session);
+}
+
+/*
+ * Destroy an eventlog session.
+ *
+ * A NULL session is ignored. If the session is still enabled a
+ * SESSION_END event is written first. The session is then unlinked from
+ * its provider and freed after an SMR grace period, so this may block.
+ */
+void
+eventlog_session_destroy(struct eventlog_session *session)
+{
+ struct eventlog_provider *provider;
+
+ if (session == NULL)
+ return;
+
+ provider = session->provider;
+ MPASS(provider != NULL);
+
+ /*
+ * NOTE(review): session->disabled is read here without
+ * provider->sessions_lock, while eventlog_provider_set_all_sessions()
+ * changes it under that lock; confirm callers exclude that overlap.
+ */
+ if (session->disabled == 0) {
+ counter_u64_add(provider->stats->sessions_enabled, -1);
+ eventlog_event_write(session, EVENTLOG_SESSION_END_ID,
+ EVENTLOG_LEVEL_INFO, EVENTLOG_KEYWORD_SESSION, NULL, 0);
+ }
+
+ counter_u64_add(provider->stats->sessions_active, -1);
+
+ /* Remove session from provider's list */
+ mtx_lock(&provider->sessions_lock);
+ LIST_REMOVE(session, link);
+ mtx_unlock(&provider->sessions_lock);
+
+ /* Wait for SMR readers before freeing */
+ smr_synchronize(evl.smr);
+ uma_zfree(evl.session_zone, session);
+}
+
+/*
+ * Query provider level and keywords.
+ * Both accessors return the provider's current field value without
+ * taking any lock; provider must not be NULL.
+ */
+enum eventlog_level
+eventlog_provider_get_level(struct eventlog_provider *provider)
+{
+ MPASS(provider != NULL);
+ return (provider->level);
+}
+
+uint32_t
+eventlog_provider_get_keywords(struct eventlog_provider *provider)
+{
+ MPASS(provider != NULL);
+ return (provider->keywords);
+}
+
+/*
+ * Return the default-enabled setting shared by all providers of this
+ * name. Unlike the other accessors, a NULL provider is tolerated and
+ * reported as 0.
+ */
+int
+eventlog_provider_get_default(struct eventlog_provider *provider)
+{
+ if (provider == NULL)
+ return (0);
+ return (provider->stats->default_enabled);
+}
+
+/*
+ * Store a new default-enabled setting in the shared stats record.
+ * The value is stored as given: unlike the sysctl handler it is neither
+ * range-checked nor reduced to 0/1, and existing sessions and the
+ * default_changed callback are not touched.
+ */
+void
+eventlog_provider_set_default(struct eventlog_provider *provider, int value)
+{
+
+ MPASS(provider != NULL);
+ provider->stats->default_enabled = value;
+}
+
+/*
+ * Return the auto-generated kern.eventlog.<name> sysctl node and its
+ * context list. Children attached by providers are freed with the
+ * node, so they must not outlive the provider.
+ *
+ * Both values live in the stats record shared by every provider of the
+ * same name, so same-named providers see the same node and context.
+ * provider must not be NULL.
+ */
+struct sysctl_oid *
+eventlog_provider_get_sysctl_node(struct eventlog_provider *provider)
+{
+ MPASS(provider != NULL);
+ return (provider->stats->sysctl_node);
+}
+
+struct sysctl_ctx_list *
+eventlog_provider_get_sysctl_ctx(struct eventlog_provider *provider)
+{
+ MPASS(provider != NULL);
+ return (&provider->stats->sysctl_ctx);
+}
+
+/*
+ * Recompute a session's effective_level and effective_keywords.
+ * Precedence: a disabled session filters everything out; otherwise a
+ * per-session override wins; otherwise the provider's settings apply.
+ * Caller must hold provider->sessions_lock.
+ */
+static void
+eventlog_session_update_effective(struct eventlog_session *session,
+    struct eventlog_provider *provider)
+{
+	if (session->disabled) {
+		session->effective_level = EVENTLOG_LEVEL_NONE;
+		session->effective_keywords = 0;
+		return;
+	}
+
+	if (session->has_override) {
+		session->effective_level = session->override_level;
+		session->effective_keywords = session->override_keywords;
+		return;
+	}
+
+	session->effective_level = provider->level;
+	session->effective_keywords = provider->keywords;
+}
+
+/*
+ * Enable (enabled != 0) or disable (enabled == 0) a session.
+ *
+ * A NULL session is ignored, and nothing changes if the session is
+ * already in the requested state.  Otherwise the provider's
+ * sessions_enabled counter is adjusted and the session's effective
+ * level/keywords are recomputed.
+ *
+ * The state test, the counter adjustment and the flag update all happen
+ * under provider->sessions_lock, the same lock
+ * eventlog_provider_set_all_sessions() holds while flipping sessions.
+ * Doing the test outside the lock would let the two paths both observe
+ * the old state and adjust sessions_enabled twice for one transition.
+ */
+void
+eventlog_session_set_enabled(struct eventlog_session *session, int enabled)
+{
+	struct eventlog_provider *provider;
+	int disabled;
+
+	if (session == NULL)
+		return;
+
+	provider = session->provider;
+	MPASS(provider != NULL);
+
+	disabled = (enabled == 0) ? 1 : 0;
+
+	mtx_lock(&provider->sessions_lock);
+	if (session->disabled != disabled) {
+		counter_u64_add(provider->stats->sessions_enabled,
+		    (enabled != 0) ? 1 : -1);
+		session->disabled = disabled;
+		eventlog_session_update_effective(session, provider);
+	}
+	mtx_unlock(&provider->sessions_lock);
+}
+
+/* Return 1 if the session exists and is not disabled, 0 otherwise. */
+int
+eventlog_session_is_enabled(struct eventlog_session *session)
+{
+	if (session == NULL)
+		return (0);
+	return (session->disabled == 0);
+}
+
+/*
+ * Set per-session level/keywords override.
+ *
+ * Passing EVENTLOG_LEVEL_NONE together with keywords == 0 clears the
+ * override, so the session falls back to the provider's settings; any
+ * other combination installs the override.  A NULL session is ignored.
+ *
+ * The override fields are written under provider->sessions_lock, the
+ * lock under which eventlog_session_update_effective() reads them, so a
+ * concurrent recomputation cannot see a half-updated override.
+ */
+void
+eventlog_session_set_filter(struct eventlog_session *session,
+    enum eventlog_level level, uint32_t keywords)
+{
+	struct eventlog_provider *provider;
+
+	if (session == NULL)
+		return;
+
+	provider = session->provider;
+	MPASS(provider != NULL);
+
+	mtx_lock(&provider->sessions_lock);
+	session->has_override =
+	    (level != EVENTLOG_LEVEL_NONE || keywords != 0) ? 1 : 0;
+	session->override_level = level;
+	session->override_keywords = keywords;
+	eventlog_session_update_effective(session, provider);
+	mtx_unlock(&provider->sessions_lock);
+}
+
+/*
+ * Write an event directly to all relevant subscribers (internal, with
+ * explicit timestamp). The payload is a scatter/gather iovec; scalar
+ * callers pass a 1-element iov. payload_size must equal the sum of
+ * iov[*].iov_len; the caller is responsible for computing it so the
+ * hot path doesn't need to walk the iov twice.
+ *
+ * Events whose header plus payload exceed UINT16_MAX bytes are dropped
+ * silently, because the length field in the header is 16 bits wide.
+ * level and keywords are not examined here; they are passed through to
+ * eventlog_subscriber_write_event().
+ */
+static void
+eventlog_event_write_impl(struct eventlog_session *session, uint32_t id,
+ enum eventlog_level level, uint32_t keywords,
+ const struct iovec *iov, int iovcnt,
+ size_t payload_size, uint64_t timestamp_us)
+{
+ struct eventlog_event_header hdr;
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *subscriber;
+ size_t total_size;
+
+ /* Assert, and additionally bail out on a NULL session. */
+ MPASS(session != NULL);
+ if (__predict_false(session == NULL))
+ return;
+
+ provider = session->provider;
+ MPASS(provider != NULL);
+
+ MPASS(iovcnt >= 0);
+ MPASS(iovcnt == 0 || iov != NULL);
+
+ total_size = sizeof(struct eventlog_event_header) + payload_size;
+
+ /* event_length is 16 bits; oversized events are dropped. */
+ if (__predict_false(total_size > UINT16_MAX))
+ return;
+
+ hdr.event_length = (uint16_t)total_size;
+ hdr.RESERVED = 0;
+ hdr.timestamp = timestamp_us;
+ hdr.thread_id = (curthread != NULL) ? curthread->td_tid : 0;
+ hdr.provider_id = provider->provider_id;
+ hdr.session_id = session->session_id;
+ hdr.event_id = id;
+
+ /* The CPU id is sampled only after entering the SMR section. */
+ smr_enter(evl.smr);
+ hdr.cpu = PCPU_GET(cpuid);
+
+ /*
+ * BUGBUG: It's possible other events raced on a different thread
+ * with a later timestamp and have already been written.
+ */
+
+ /*
+ * Events written by the dump taskqueue thread while a dump is in
+ * progress go only to the subscriber that requested the dump; all
+ * other events fan out to every subscriber.
+ */
+ if (__predict_false(evl.dump_target != NULL &&
+ curthread == evl.dump_thread)) {
+ eventlog_subscriber_write_event(evl.dump_target, session,
+ &hdr, iov, iovcnt, payload_size,
+ (uint16_t)total_size, level, keywords);
+ } else {
+ CK_LIST_FOREACH(subscriber, &evl.subscribers, link) {
+ eventlog_subscriber_write_event(subscriber, session,
+ &hdr, iov, iovcnt, payload_size,
+ (uint16_t)total_size, level, keywords);
+ }
+ }
+
+ smr_exit(evl.smr);
+}
+
+/*
+ * Write an event with a single contiguous payload, timestamped with the
+ * current uptime.
+ */
+void
+eventlog_event_write(struct eventlog_session *session, uint32_t id,
+    enum eventlog_level level, uint32_t keywords, void *buffer, size_t length)
+{
+	struct bintime now;
+	struct iovec payload;
+
+	payload.iov_base = buffer;
+	payload.iov_len = length;
+
+	binuptime(&now);
+	eventlog_event_write_impl(session, id, level, keywords, &payload, 1,
+	    length, bintime2us(&now));
+}
+
+/*
+ * Write an event with a single contiguous payload and a caller-supplied
+ * timestamp (microseconds).
+ */
+void
+eventlog_event_write_at(struct eventlog_session *session, uint32_t id,
+    enum eventlog_level level, uint32_t keywords, void *buffer, size_t length,
+    uint64_t timestamp_us)
+{
+	struct iovec payload;
+
+	payload.iov_base = buffer;
+	payload.iov_len = length;
+
+	eventlog_event_write_impl(session, id, level, keywords, &payload, 1,
+	    length, timestamp_us);
+}
+
+/*
+ * Write an event whose payload is scattered over iovcnt iovec segments,
+ * timestamped with the current uptime (sampled after the segment
+ * lengths have been summed).
+ */
+void
+eventlog_event_write_gather(struct eventlog_session *session, uint32_t id,
+    enum eventlog_level level, uint32_t keywords,
+    const struct iovec *iov, int iovcnt)
+{
+	struct bintime now;
+	size_t total = 0;
+	int seg = 0;
+
+	while (seg < iovcnt)
+		total += iov[seg++].iov_len;
+
+	binuptime(&now);
+	eventlog_event_write_impl(session, id, level, keywords, iov, iovcnt,
+	    total, bintime2us(&now));
+}
+
+/*
+ * Write an event whose payload is scattered over iovcnt iovec segments,
+ * using a caller-supplied timestamp (microseconds).
+ */
+void
+eventlog_event_write_gather_at(struct eventlog_session *session, uint32_t id,
+    enum eventlog_level level, uint32_t keywords,
+    const struct iovec *iov, int iovcnt, uint64_t timestamp_us)
+{
+	size_t total = 0;
+	int seg = 0;
+
+	while (seg < iovcnt)
+		total += iov[seg++].iov_len;
+
+	eventlog_event_write_impl(session, id, level, keywords, iov, iovcnt,
+	    total, timestamp_us);
+}
+
+/*
+ * Create a new device-based subscriber with per-CPU buffers.
+ * buffer_size_per_cpu: Size of buffer to allocate per CPU.
+ * The subscriber is automatically added to the global subscribers list.
+ * Returns NULL on failure, subscriber pointer on success.
+ *
+ * The only failure visible here is a buffer_size_per_cpu outside
+ * [EVENTLOG_BUFFER_SIZE_MIN, EVENTLOG_BUFFER_SIZE_MAX]; every allocation
+ * uses M_WAITOK. Two buffers of buffer_size_per_cpu bytes are allocated
+ * for each of the mp_maxid + 1 CPU slots.
+ */
+struct eventlog_subscriber *
+eventlog_subscriber_create_device(uint32_t buffer_size_per_cpu)
+{
+ struct eventlog_subscriber *subscriber;
+ struct eventlog_percpu_buffer *percpu_buffers;
+ int cpu, maxcpu;
+
+ if (buffer_size_per_cpu < EVENTLOG_BUFFER_SIZE_MIN ||
+ buffer_size_per_cpu > EVENTLOG_BUFFER_SIZE_MAX)
+ return (NULL);
+
+ /* Allocate subscriber structure */
+ subscriber = malloc(sizeof(*subscriber), M_EVENTLOG, M_ZERO | M_WAITOK);
+ MPASS(subscriber != NULL);
+
+ CK_SLIST_INIT(&subscriber->subscriptions);
+ subscriber->type = EVENTLOG_SUBSCRIBER_TYPE_DEVICE;
+ subscriber->u.device.buffer_size_per_cpu = buffer_size_per_cpu;
+ subscriber->u.device.reader_waiting = 0;
+ subscriber->u.device.heap_size = 0;
+ mtx_init(&subscriber->dump_pending_mtx, "eventlog dump pending",
+ NULL, MTX_DEF);
+ cv_init(&subscriber->dump_pending_cv, "evl_dump");
+
+ /* Allocate per-CPU buffers */
+ maxcpu = mp_maxid + 1;
+ percpu_buffers = malloc(sizeof(*percpu_buffers) * maxcpu,
+ M_EVENTLOG, M_WAITOK | M_ZERO);
+ MPASS(percpu_buffers != NULL);
+ subscriber->u.device.percpu_buffers = percpu_buffers;
+
+ /* Allocate cpu_timestamps and heap for merge ordering */
+ subscriber->u.device.cpu_timestamps = malloc(sizeof(uint64_t) * maxcpu,
+ M_EVENTLOG, M_WAITOK | M_ZERO);
+ MPASS(subscriber->u.device.cpu_timestamps != NULL);
+ subscriber->u.device.heap_cpus = malloc(sizeof(uint16_t) * maxcpu,
+ M_EVENTLOG, M_WAITOK | M_ZERO);
+ MPASS(subscriber->u.device.heap_cpus != NULL);
+ /* No CPU has a pending event yet, and the heap starts empty. */
+ for (cpu = 0; cpu < maxcpu; cpu++)
+ subscriber->u.device.cpu_timestamps[cpu] =
+ EVENTLOG_TIMESTAMP_NONE;
+
+ /* Allocate reader/writer buffers for each CPU */
+ for (cpu = 0; cpu < maxcpu; cpu++) {
+ percpu_buffers[cpu].buffer_size = buffer_size_per_cpu;
+ percpu_buffers[cpu].packed_state = EVTLOG_SWAP_ALLOWED;
+#ifndef EVENTLOG_HAS_ATOMIC64
+ /* Without 64-bit atomics a spin mutex is set up per CPU. */
+ mtx_init(&percpu_buffers[cpu].swap_lock,
+ "eventlog swap", NULL, MTX_SPIN);
+#endif
+ percpu_buffers[cpu].buffers[0] = malloc(buffer_size_per_cpu,
+ M_EVENTLOG, M_WAITOK | M_ZERO);
+ MPASS(percpu_buffers[cpu].buffers[0] != NULL);
+ percpu_buffers[cpu].buffers[1] = malloc(buffer_size_per_cpu,
+ M_EVENTLOG, M_WAITOK | M_ZERO);
+ MPASS(percpu_buffers[cpu].buffers[1] != NULL);
+ }
+
+ /*
+ * Add subscriber to global list. This is done last, after all of
+ * the subscriber's buffers have been set up.
+ */
+ mtx_lock(&evl.subscribers_mtx);
+ CK_LIST_INSERT_HEAD(&evl.subscribers, subscriber, link);
+ mtx_unlock(&evl.subscribers_mtx);
+
+ return (subscriber);
+}
+
+/*
+ * Create a subscriber that receives events through a function call
+ * rather than through per-CPU buffers.
+ *
+ * callback:     function invoked for arriving events; must not be NULL.
+ * callback_arg: opaque pointer stored alongside the callback.
+ *
+ * The new subscriber is linked onto the global subscribers list before
+ * it is returned.  The allocation uses M_WAITOK.
+ */
+struct eventlog_subscriber *
+eventlog_subscriber_create_callback(eventlog_callback_t callback,
+    void *callback_arg)
+{
+	struct eventlog_subscriber *sub;
+
+	MPASS(callback != NULL);
+
+	sub = malloc(sizeof(*sub), M_EVENTLOG, M_ZERO | M_WAITOK);
+	MPASS(sub != NULL);
+
+	/* Fill in the callback flavour of the subscriber. */
+	sub->type = EVENTLOG_SUBSCRIBER_TYPE_CALLBACK;
+	sub->u.callback.callback = callback;
+	sub->u.callback.callback_arg = callback_arg;
+	CK_SLIST_INIT(&sub->subscriptions);
+
+	/* Dump-completion bookkeeping. */
+	mtx_init(&sub->dump_pending_mtx, "eventlog dump pending", NULL,
+	    MTX_DEF);
+	cv_init(&sub->dump_pending_cv, "evl_dump");
+
+	/* Publish on the global list. */
+	mtx_lock(&evl.subscribers_mtx);
+	CK_LIST_INSERT_HEAD(&evl.subscribers, sub, link);
+	mtx_unlock(&evl.subscribers_mtx);
+
+	return (sub);
+}
+
+/*
+ * Async dump_state machinery. One eventlog_dump_task per (subscriber,
+ * provider) pair is enqueued on evl.dump_tq; the TQ thread publishes
+ * (dump_thread, dump_target), invokes provider->dump_callback, then
+ * decrements subscriber->dump_pending and signals dump_pending_cv.
+ *
+ * Subscriber and provider pointers in the task are kept alive by their
+ * destroy paths draining the TQ before freeing memory.
+ *
+ * The task structure itself is freed by eventlog_dump_task_handler()
+ * once the dump has run.
+ */
+struct eventlog_dump_task {
+ struct task task; /* Taskqueue linkage */
+ struct eventlog_subscriber *subscriber; /* Receives the dump events */
+ struct eventlog_provider *provider; /* Its dump_callback is run */
+};
+
+/*
+ * Forward declarations for eventlog_emit_dump_complete(); definitions
+ * are further down with the rest of the subscriber write path.
+ */
+static void eventlog_subscriber_write_event_device(
+ struct eventlog_subscriber *subscriber,
+ struct eventlog_provider *provider, uint64_t session_id,
+ struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
+ size_t payload_size);
+static void eventlog_subscriber_write_event_callback(
+ struct eventlog_subscriber *subscriber,
+ struct eventlog_provider *provider, uint64_t session_id,
+ struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
+ size_t payload_size);
+
+/*
+ * Deliver a payload-less EVENTLOG_DUMP_COMPLETE_ID event to `subscriber`
+ * after `provider`'s dump_callback has returned.
+ *
+ * The event carries EVENTLOG_SESSION_ID_NONE as its session id.  It is
+ * delivered only if the subscriber's subscription to this provider
+ * admits EVENTLOG_LEVEL_INFO events with EVENTLOG_KEYWORD_SESSION set,
+ * i.e. the same filter that applies to SESSION_CREATE/SESSION_END.  Only
+ * the first subscription found for the provider is consulted.
+ */
+static void
+eventlog_emit_dump_complete(struct eventlog_provider *provider,
+    struct eventlog_subscriber *subscriber)
+{
+	struct eventlog_event_header hdr;
+	struct eventlog_subscription *sub;
+	struct iovec iov = { .iov_base = NULL, .iov_len = 0 };
+	struct bintime bt;
+	bool match = false;
+
+	binuptime(&bt);
+	hdr.event_length = (uint16_t)sizeof(hdr);
+	hdr.RESERVED = 0;
+	hdr.timestamp = bintime2us(&bt);
+	hdr.thread_id = (curthread != NULL) ? curthread->td_tid : 0;
+	hdr.provider_id = provider->provider_id;
+	hdr.session_id = EVENTLOG_SESSION_ID_NONE;
+	hdr.event_id = EVENTLOG_DUMP_COMPLETE_ID;
+
+	smr_enter(evl.smr);
+	hdr.cpu = PCPU_GET(cpuid);
+
+	/* Locate this provider's subscription and apply its filter. */
+	CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
+		if (sub->provider != provider)
+			continue;
+		match = (EVENTLOG_LEVEL_INFO <= sub->level &&
+		    (sub->keywords & EVENTLOG_KEYWORD_SESSION) != 0);
+		break;
+	}
+
+	if (match) {
+		if (subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE)
+			eventlog_subscriber_write_event_device(subscriber,
+			    provider, EVENTLOG_SESSION_ID_NONE, &hdr, &iov,
+			    0, 0);
+		else
+			eventlog_subscriber_write_event_callback(subscriber,
+			    provider, EVENTLOG_SESSION_ID_NONE, &hdr, &iov,
+			    0, 0);
+	}
+
+	smr_exit(evl.smr);
+}
+
+/*
+ * Taskqueue handler for one queued state dump.
+ *
+ * `context` is the struct eventlog_dump_task naming the
+ * subscriber/provider pair; it is freed here. The handler publishes
+ * evl.dump_thread/evl.dump_target, runs the provider's dump_callback,
+ * emits the dump-complete marker, clears the publication, and finally
+ * drops the subscriber's dump_pending count, waking waiters when it
+ * reaches zero. `pending` is unused.
+ */
+static void
+eventlog_dump_task_handler(void *context, int pending __unused)
+{
+ struct eventlog_dump_task *dt = context;
+ struct eventlog_subscriber *subscriber = dt->subscriber;
+ struct eventlog_provider *provider = dt->provider;
+
+ /*
+ * No lock around the publication: the single-threaded TQ is the
+ * only writer; other threads' curthread != dump_thread so they
+ * always take the normal subscriber-fanout path regardless of
+ * any torn read.
+ */
+ evl.dump_thread = curthread;
+ evl.dump_target = subscriber;
+ provider->dump_callback(provider, provider->dump_callback_arg);
+ eventlog_emit_dump_complete(provider, subscriber);
+ evl.dump_target = NULL;
+ evl.dump_thread = NULL;
+
+ /*
+ * Account for this dump being done; the last one out wakes anyone
+ * blocked on dump_pending_cv.
+ */
+ mtx_lock(&subscriber->dump_pending_mtx);
+ KASSERT(subscriber->dump_pending > 0,
+ ("eventlog: dump_pending underflow on %p", subscriber));
+ if (--subscriber->dump_pending == 0)
+ cv_broadcast(&subscriber->dump_pending_cv);
+ mtx_unlock(&subscriber->dump_pending_mtx);
+
+ free(dt, M_EVENTLOG);
+}
+
+/*
+ * Wait until no dump_state task for this subscriber remains, whether it
+ * is still queued or currently executing.  A NULL subscriber is ignored.
+ */
+void
+eventlog_subscriber_drain_dumps(struct eventlog_subscriber *subscriber)
+{
+
+	if (subscriber != NULL) {
+		mtx_lock(&subscriber->dump_pending_mtx);
+		/* Re-check after every wakeup; the count is the predicate. */
+		while (subscriber->dump_pending > 0) {
+			cv_wait(&subscriber->dump_pending_cv,
+			    &subscriber->dump_pending_mtx);
+		}
+		mtx_unlock(&subscriber->dump_pending_mtx);
+	}
+}
+
+/*
+ * Destroy a subscriber and update provider enablement.
+ *
+ * A NULL subscriber is a no-op. Teardown order:
+ * 1. drain outstanding dump tasks (may sleep in cv_wait);
+ * 2. unlink the subscriber from evl.subscribers;
+ * 3. recompute enablement for every provider it was subscribed to;
+ * 4. wait for SMR readers with smr_synchronize();
+ * 5. free the subscriptions, any device buffers, and the subscriber.
+ */
+void
+eventlog_subscriber_destroy(struct eventlog_subscriber *subscriber)
+{
+ struct eventlog_subscription *sub, *sub_next;
+
+ if (subscriber == NULL)
+ return;
+
+ /*
+ * Drain dump tasks first; they reference this subscriber's
+ * buffers and would UAF if we freed them mid-callback.
+ */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ /* Remove subscriber from global list */
+ mtx_lock(&evl.subscribers_mtx);
+ CK_LIST_REMOVE(subscriber, link);
+ mtx_unlock(&evl.subscribers_mtx);
+
+ /* Update all provider enablements (we're no longer visible) */
+ CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
+ eventlog_update_provider_enablement(sub->provider);
+ }
+
+ /* Wait for all SMR readers before freeing */
+ smr_synchronize(evl.smr);
+
+ /* Free subscriptions, buffers, and subscriber */
+ CK_SLIST_FOREACH_SAFE(sub, &subscriber->subscriptions, link, sub_next) {
+ free(sub, M_EVENTLOG);
+ }
+
+ /* Clean up subscriber based on type */
+ if (subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE) {
+ int cpu, maxcpu = mp_maxid + 1;
+ struct eventlog_percpu_buffer *percpu_buffers =
+ subscriber->u.device.percpu_buffers;
+
+ if (percpu_buffers != NULL) {
+ /* Each CPU slot owns two buffers (buffers[0] and buffers[1]). */
+ for (cpu = 0; cpu < maxcpu; cpu++) {
+ if (percpu_buffers[cpu].buffers[0] != NULL)
+ free(percpu_buffers[cpu].buffers[0],
+ M_EVENTLOG);
+ if (percpu_buffers[cpu].buffers[1] != NULL)
+ free(percpu_buffers[cpu].buffers[1],
+ M_EVENTLOG);
+#ifndef EVENTLOG_HAS_ATOMIC64
+ mtx_destroy(&percpu_buffers[cpu].swap_lock);
+#endif
+ }
+ free(percpu_buffers, M_EVENTLOG);
+ }
+ if (subscriber->u.device.cpu_timestamps != NULL)
+ free(subscriber->u.device.cpu_timestamps, M_EVENTLOG);
+ if (subscriber->u.device.heap_cpus != NULL)
+ free(subscriber->u.device.heap_cpus, M_EVENTLOG);
+ }
+ /* Callback subscribers don't need cleanup */
+
+ cv_destroy(&subscriber->dump_pending_cv);
+ mtx_destroy(&subscriber->dump_pending_mtx);
+ free(subscriber, M_EVENTLOG);
+}
+
+/*
+ * Subscribe to a single provider. Handles both new subscriptions and
+ * updating existing ones.
+ *
+ * On a brand-new subscription (not an in-place update) and only when
+ * the provider has a dump_callback, enqueue one task on evl.dump_tq
+ * so the provider can replay current state. Re-subscribing does not
+ * re-fire the dump.
+ *
+ * `level` and `keywords` become the subscription's filter. Of `flags`,
+ * only EVENTLOG_SUBSCRIPTION_DUMP_STATE is examined here. Provider
+ * enablement is recomputed in both the new and the update case.
+ */
+static void
+eventlog_subscriber_add_subscription_one(struct eventlog_subscriber *subscriber,
+ struct eventlog_provider *provider, enum eventlog_level level,
+ uint32_t keywords, uint32_t flags)
+{
+ struct eventlog_subscription *sub, *new_sub;
+ struct eventlog_dump_task *dt;
+ bool newly_subscribed = false;
+
+ /*
+ * Allocated up front, before subscribers_mtx is taken; released
+ * again below if an existing subscription is updated instead.
+ */
+ new_sub = malloc(sizeof(*new_sub), M_EVENTLOG, M_WAITOK);
+ MPASS(new_sub != NULL);
+ new_sub->provider = provider;
+ new_sub->level = level;
+ new_sub->keywords = keywords;
+
+ mtx_lock(&evl.subscribers_mtx);
+
+ CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
+ if (sub->provider == provider) {
+ /* Already subscribed; update in place. */
+ sub->level = level;
+ sub->keywords = keywords;
+ mtx_unlock(&evl.subscribers_mtx);
+ free(new_sub, M_EVENTLOG);
+ goto update_enablement;
+ }
+ }
+
+ CK_SLIST_INSERT_HEAD(&subscriber->subscriptions, new_sub, link);
+ newly_subscribed = true;
+
+ mtx_unlock(&evl.subscribers_mtx);
+
+update_enablement:
+
+ /* Update provider enablement */
+ eventlog_update_provider_enablement(provider);
+
+ /* A dump needs all three: new subscription, callback, and the flag. */
+ if (!newly_subscribed || provider->dump_callback == NULL ||
+ (flags & EVENTLOG_SUBSCRIPTION_DUMP_STATE) == 0)
+ return;
+
+ /*
+ * First-time subscribe + dump_callback + DUMP_STATE flag:
+ * enqueue an async dump. Bumping dump_pending under the
+ * subscriber's mtx ensures a racing destroy() either sees the
+ * pending count and waits, or finds none yet and our task is
+ * still scheduled to fire after subscribe returns.
+ *
+ * M_NOWAIT: on failure skip the dump rather than block subscribe;
+ * the live event stream is still delivered.
+ */
+ dt = malloc(sizeof(*dt), M_EVENTLOG, M_NOWAIT);
+ if (dt == NULL)
+ return;
+ TASK_INIT(&dt->task, 0, eventlog_dump_task_handler, dt);
+ dt->subscriber = subscriber;
+ dt->provider = provider;
+
+ /* Count the dump before it can run; the handler decrements it. */
+ mtx_lock(&subscriber->dump_pending_mtx);
+ subscriber->dump_pending++;
+ mtx_unlock(&subscriber->dump_pending_mtx);
+
+ taskqueue_enqueue(evl.dump_tq, &dt->task);
+}
+
+/*
+ * Add a subscription to a subscriber.
+ *
+ * Provider names are not unique (e.g. several TCP stacks may each
+ * register "tcp"), so the subscription is applied to EVERY registered
+ * provider whose name equals provider_name, up to
+ * EVENTLOG_MAX_PROVIDERS matches; further matches are ignored.
+ *
+ * `flags` is a bitmask of EVENTLOG_SUBSCRIPTION_* values.  With
+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE set, each newly-subscribed provider
+ * that has a dump_callback gets an asynchronous dump enqueued;
+ * eventlog_subscriber_drain_dumps() waits for those.
+ *
+ * Returns 0 on success, EINVAL if `flags` has unknown bits, ENOENT if no
+ * provider with that name is registered.
+ */
+int
+eventlog_subscriber_add_subscription(struct eventlog_subscriber *subscriber,
+    const char *provider_name, enum eventlog_level level, uint32_t keywords,
+    uint32_t flags)
+{
+	struct eventlog_provider *matched[EVENTLOG_MAX_PROVIDERS];
+	struct eventlog_provider *candidate;
+	int found = 0;
+	int idx;
+
+	MPASS(subscriber != NULL);
+	MPASS(provider_name != NULL);
+
+	if ((flags & ~EVENTLOG_SUBSCRIPTION_FLAGS_VALID) != 0)
+		return (EINVAL);
+
+	/* Snapshot the matching providers under the provider-list lock. */
+	mtx_lock(&evl.providers_lock);
+	LIST_FOREACH(candidate, &evl.providers, link) {
+		if (strcmp(candidate->name, provider_name) != 0)
+			continue;
+		if (found >= EVENTLOG_MAX_PROVIDERS)
+			continue;
+		matched[found] = candidate;
+		found++;
+	}
+	mtx_unlock(&evl.providers_lock);
+
+	if (found == 0) {
+		/* TODO: Support subscribing before provider is registered. */
+		return (ENOENT);
+	}
+
+	/* Subscribe to each match with the list lock already released. */
+	idx = 0;
+	while (idx < found) {
+		eventlog_subscriber_add_subscription_one(subscriber,
+		    matched[idx], level, keywords, flags);
+		idx++;
+	}
+
+	return (0);
+}
+
+/*
+ * Recompute a provider's enablement from all active subscribers.
+ * The provider's keywords become the OR of every matching subscription's
+ * keywords and its level the MAX (most verbose) of their levels; with no
+ * subscriber left both are reset (0 / EVENTLOG_LEVEL_NONE).
+ *
+ * sessions_lock is held across the recount and the per-session update so
+ * that the subscribers_changed callback fires exactly once per real
+ * 0<->N edge.  The callback itself runs after the lock is dropped.
+ */
+static void
+eventlog_update_provider_enablement(struct eventlog_provider *provider)
+{
+	struct eventlog_subscriber *subscriber;
+	struct eventlog_subscription *sub;
+	struct eventlog_session *session;
+	enum eventlog_level widest_level = EVENTLOG_LEVEL_NONE;
+	uint32_t keyword_union = 0;
+	bool any_subscriber = false;
+	bool edge;
+
+	MPASS(provider != NULL);
+
+	mtx_lock(&provider->sessions_lock);
+
+	/* Fold every subscription that targets this provider. */
+	smr_enter(evl.smr);
+	CK_LIST_FOREACH(subscriber, &evl.subscribers, link) {
+		CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
+			if (sub->provider != provider)
+				continue;
+			any_subscriber = true;
+			keyword_union |= sub->keywords;
+			if (sub->level > widest_level)
+				widest_level = sub->level;
+		}
+	}
+	smr_exit(evl.smr);
+
+	/* Record a 0<->N transition; store only when the value changes. */
+	edge = (provider->has_subscribers != any_subscriber);
+	if (edge)
+		provider->has_subscribers = any_subscriber;
+
+	/* Publish the aggregate filter, or disable the provider. */
+	provider->keywords = any_subscriber ? keyword_union : 0;
+	provider->level = any_subscriber ? widest_level :
+	    EVENTLOG_LEVEL_NONE;
+
+	/* Propagate to each session's effective values. */
+	LIST_FOREACH(session, &provider->sessions, link)
+		eventlog_session_update_effective(session, provider);
+	mtx_unlock(&provider->sessions_lock);
+
+	if (edge && provider->subscribers_changed != NULL)
+		provider->subscribers_changed(provider, any_subscriber,
+		    provider->subscribers_changed_arg);
+}
+
+/*
+ * Make data available in one CPU's reader buffer if possible.
+ *
+ * Returns true when the reader buffer holds data: either a swap is
+ * currently not allowed (a previous swap, ours or a writer's proactive
+ * one, already filled it) or we performed the swap ourselves.  Returns
+ * false when a swap is allowed but the active buffer is empty, i.e.
+ * there is nothing to read.
+ *
+ * The swap CAS can lose to a concurrent writer commit or proactive swap;
+ * evtlog_try_swap() then refreshes `state` and both conditions are
+ * re-evaluated.  `cpu` is not used.
+ */
+static EVENTLOG_INLINING bool
+eventlog_swap_cpu_buffer_if_needed(struct eventlog_percpu_buffer *pcpu_buf,
+    int cpu)
+{
+	evtlog_state_t state = evtlog_load_state(pcpu_buf);
+
+	do {
+		if (!evtlog_state_swap_allowed(state)) {
+			MPASS(evtlog_state_reader_len(state) > 0);
+			return (true);
+		}
+
+		if (evtlog_state_commit_pos(state) == 0)
+			return (false);
+
+		/* On CAS failure `state` is refreshed; go round again. */
+	} while (!evtlog_try_swap(pcpu_buf, &state));
+
+	return (true);
+}
+
+/*
+ * Sweep all CPUs, swapping buffers where the reader buffer is empty and
+ * the active buffer has data, and build/preserve the merge heap
+ * (min-heap by timestamp) of CPUs that have data.  A CPU already in the
+ * heap from a previous call is skipped entirely: no swap, no reinsert.
+ * A CPU with nothing to read is marked EVENTLOG_TIMESTAMP_NONE.
+ */
+static void
+eventlog_swap_buffers_if_needed(struct eventlog_subscriber *subscriber)
+{
+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
+	struct eventlog_percpu_buffer *pcpu_buf;
+	int cpu;
+
+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
+
+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
+		/* Already in the heap: leave it alone. */
+		if (timestamps[cpu] < EVENTLOG_TIMESTAMP_SWEPT)
+			continue;
+
+		pcpu_buf = &subscriber->u.device.percpu_buffers[cpu];
+		if (!eventlog_swap_cpu_buffer_if_needed(pcpu_buf, cpu)) {
+			timestamps[cpu] = EVENTLOG_TIMESTAMP_NONE;
+			continue;
+		}
+
+		eventlog_heap_insert(subscriber, (uint16_t)cpu,
+		    eventlog_peek_next_timestamp(pcpu_buf));
+	}
+}
+
+/*
+ * Read events from a device subscriber's buffer.
+ * Handles both user-space (UIO_USERSPACE) and kernel (UIO_SYSSPACE) uio.
+ *
+ * `flags` is checked for FNONBLOCK only.
+ *
+ * Returns:
+ * EOPNOTSUPP the uio does not have exactly one iovec, or uio_resid is 0;
+ * EAGAIN no data and FNONBLOCK is set, or still no data after one
+ * tsleep() of at most hz ticks (so a blocking read can also
+ * return EAGAIN);
+ * other a tsleep() error other than EWOULDBLOCK;
+ * 0 the merge pass ran; the amount copied is reflected in the uio.
+ *
+ * NOTE(review): eventlog_read_merged() returns void, so a failure while
+ * copying to the caller's buffer is not reported from here, and 0 can be
+ * returned with no bytes transferred - confirm this is intended.
+ */
+int
+eventlog_subscriber_read(struct eventlog_subscriber *subscriber,
+ struct uio *uio, int flags)
+{
+ struct bintime bt;
+ uint64_t read_timestamp;
+ int error = 0;
+
+ MPASS(subscriber != NULL);
+ MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
+ MPASS(uio != NULL);
+
+ if (uio->uio_iovcnt != 1 || uio->uio_resid == 0)
+ return (EOPNOTSUPP); /* Only one iovec supported */
+
+ /* Swap to get latest data, then check if we have anything to read. */
+ eventlog_swap_buffers_if_needed(subscriber);
+
+ if (subscriber->u.device.heap_size == 0) {
+ if (flags & FNONBLOCK)
+ return (EAGAIN);
+
+ /* Wait for writers to produce data. */
+ /*
+ * reader_waiting is the flag the device write path tests before
+ * calling wakeup(); the hz timeout bounds the sleep either way.
+ */
+ atomic_store_rel_32(&subscriber->u.device.reader_waiting, 1);
+ error = tsleep(subscriber, PCATCH, "evtlogrd", hz);
+ atomic_store_rel_32(&subscriber->u.device.reader_waiting, 0);
+ if (error != 0 && error != EWOULDBLOCK)
+ return (error);
+
+ /* Woken or timed out: sweep once more before giving up. */
+ eventlog_swap_buffers_if_needed(subscriber);
+ if (subscriber->u.device.heap_size == 0)
+ return (EAGAIN);
+ }
+
+ /* Events stamped after this instant are deferred to the next read. */
+ binuptime(&bt);
+ read_timestamp = bintime2us(&bt);
+
+ eventlog_read_merged(subscriber, uio, read_timestamp);
+ return (0);
+}
+
+/*
+ * Second look at CPUs that had no data, run after the merge hits the
+ * read_timestamp boundary.  It picks up events from preempted writers
+ * that committed before read_timestamp but whose CPU had nothing to read
+ * when it was last examined.
+ *
+ * Only CPUs marked EVENTLOG_TIMESTAMP_NONE are considered.  A CPU whose
+ * next event is at or before read_timestamp goes back into the heap;
+ * every other examined CPU is marked EVENTLOG_TIMESTAMP_SWEPT.
+ */
+static void
+eventlog_resweep_idle_cpus(struct eventlog_subscriber *subscriber,
+    uint64_t read_timestamp)
+{
+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
+	struct eventlog_percpu_buffer *pcpu_buf;
+	uint64_t ts;
+	bool reinserted;
+	int cpu;
+
+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
+		/* In the heap, or already swept. */
+		if (timestamps[cpu] != EVENTLOG_TIMESTAMP_NONE)
+			continue;
+
+		reinserted = false;
+		pcpu_buf = &subscriber->u.device.percpu_buffers[cpu];
+		if (eventlog_swap_cpu_buffer_if_needed(pcpu_buf, cpu)) {
+			ts = eventlog_peek_next_timestamp(pcpu_buf);
+			if (ts <= read_timestamp) {
+				eventlog_heap_insert(subscriber,
+				    (uint16_t)cpu, ts);
+				reinserted = true;
+			}
+		}
+
+		if (!reinserted)
+			timestamps[cpu] = EVENTLOG_TIMESTAMP_SWEPT;
+	}
+}
+
+/*
+ * Merge events from all CPUs in timestamp order, copying via uio.
+ * Events with timestamps beyond read_timestamp are deferred to the next read.
+ * Caller must have called eventlog_swap_buffers_if_needed beforehand.
+ *
+ * Each iteration works on the heap root (the CPU with the lowest pending
+ * timestamp) and copies its events up to min(second-lowest heap
+ * timestamp, read_timestamp). The loop ends when the uio is full, the
+ * heap is empty, or the copy helper reports it could not place the next
+ * event. Nothing is returned; progress is visible only through the uio.
+ */
+static EVENTLOG_INLINING void
+eventlog_read_merged(struct eventlog_subscriber *subscriber, struct uio *uio,
+ uint64_t read_timestamp)
+{
+ struct eventlog_percpu_buffer *pcpu_buf;
+ uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
+
+ MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
+ MPASS(subscriber->u.device.heap_size > 0);
+
+ /* Take lowest timestamp, copy from that CPU, reinsert when drained. */
+ while (uio->uio_resid > 0 && subscriber->u.device.heap_size > 0) {
+ uint16_t current_cpu = subscriber->u.device.heap_cpus[0];
+ uint64_t max_timestamp, effective_max, next_timestamp;
+ bool uio_out_of_space;
+
+ pcpu_buf = &subscriber->u.device.percpu_buffers[current_cpu];
+ /* Copy no further than the next CPU's first event or the epoch. */
+ max_timestamp = eventlog_heap_second_min_timestamp(subscriber);
+ effective_max = (max_timestamp < read_timestamp) ?
+ max_timestamp : read_timestamp;
+
+ eventlog_copy_events_from_cpu(subscriber, pcpu_buf, uio,
+ effective_max, &next_timestamp, &uio_out_of_space);
+
+ if (uio_out_of_space)
+ break;
+
+ EVENTLOG_VALIDATE_READER(pcpu_buf);
+ if (evtlog_state_reader_len(evtlog_load_state(pcpu_buf)) ==
+ pcpu_buf->read_pos) {
+ MPASS(next_timestamp == 0);
+
+ /*
+ * Reader buffer fully drained. Atomically clear
+ * reader_len and set swap_allowed in one CAS so
+ * the upper-32-bit and lower-32-bit updates are
+ * inseparable from concurrent writer commits.
+ */
+ evtlog_drain_complete(pcpu_buf);
+
+ if (eventlog_swap_cpu_buffer_if_needed(pcpu_buf,
+ current_cpu)) {
+ /*
+ * Single CPU swapped; update timestamp and
+ * possibly reinsert.
+ */
+ next_timestamp =
+ eventlog_peek_next_timestamp(pcpu_buf);
+ if (next_timestamp > read_timestamp) {
+ eventlog_heap_extract_min(subscriber);
+ eventlog_resweep_idle_cpus(subscriber,
+ read_timestamp);
+ continue;
+ }
+ /* Still the minimum: keep it at the root. */
+ if (next_timestamp <= max_timestamp) {
+ timestamps[current_cpu] =
+ next_timestamp;
+ continue;
+ }
+ /* No longer min; update root and sift down. */
+ eventlog_heap_update_root(subscriber,
+ next_timestamp);
+ } else {
+ /* Buffer drained, no swap: remove from heap. */
+ eventlog_heap_extract_min(subscriber);
+ }
+ continue;
+ }
+
+ if (next_timestamp > read_timestamp) {
+ /* Remaining events are past the epoch boundary. */
+ eventlog_heap_extract_min(subscriber);
+ eventlog_resweep_idle_cpus(subscriber, read_timestamp);
+ continue;
+ }
+
+ /* Buffer has more data within epoch: update root and sift. */
+ MPASS(next_timestamp != 0);
+ eventlog_heap_update_root(subscriber, next_timestamp);
+ }
+}
+
+/*
+ * Copy events from a CPU buffer up to a given timestamp threshold.
+ * UIO_USERSPACE uses copyout; UIO_SYSSPACE uses bcopy directly.
+ * Stops if we run out of space.
+ *
+ * Works in two phases: scan headers from read_pos to size the longest
+ * run of whole events with timestamp <= max_timestamp that fits in
+ * uio_resid, then copy that run in one operation into uio_iov[0] and
+ * advance read_pos.
+ *
+ * Outputs:
+ * *next_timestamp_out 0 when the copy consumed the reader buffer
+ * completely; otherwise the timestamp of the last
+ * header examined by the scan.
+ * *uio_out_of_space_out true when the next event does not fit in the
+ * remaining uio space, and also when copyout()
+ * fails (read_pos is then left unchanged).
+ *
+ * NOTE(review): a copyout() fault is indistinguishable from "buffer
+ * full" for the caller, so it is not reported as an error - confirm.
+ * NOTE(review): pcpu_buf and uio are dereferenced by the initialisers
+ * below before the MPASS() NULL checks run.
+ */
+static EVENTLOG_INLINING void
+eventlog_copy_events_from_cpu(
+ struct eventlog_subscriber *subscriber,
+ struct eventlog_percpu_buffer *pcpu_buf, struct uio *uio,
+ uint64_t max_timestamp, uint64_t *next_timestamp_out,
+ bool *uio_out_of_space_out)
+{
+ uint32_t bytes_consumed = 0;
+ uint64_t next_timestamp;
+ evtlog_state_t cur_state = evtlog_load_state(pcpu_buf);
+ /* The reader buffer is whichever of the two is not active. */
+ int reader = 1 - evtlog_state_active(cur_state);
+ size_t space_avail = uio->uio_resid;
+ uint32_t available = evtlog_state_reader_len(cur_state) -
+ pcpu_buf->read_pos;
+
+ MPASS(pcpu_buf != NULL);
+ MPASS(uio != NULL);
+ MPASS(next_timestamp_out != NULL);
+ MPASS(uio_out_of_space_out != NULL);
+ EVENTLOG_VALIDATE_READER(pcpu_buf);
+
+ *uio_out_of_space_out = false;
+
+ /* Scan events to compute contiguous batch within max_timestamp. */
+ do {
+ struct eventlog_event_header hdr;
+ uint32_t offset = pcpu_buf->read_pos + bytes_consumed;
+
+ MPASS((available - bytes_consumed) >=
+ sizeof(struct eventlog_event_header));
+ MPASS(offset < pcpu_buf->buffer_size);
+ /* Copy the header out of the byte buffer before reading fields. */
+ memcpy(&hdr, (uint8_t *)pcpu_buf->buffers[reader] + offset,
+ sizeof(struct eventlog_event_header));
+
+ MPASS(hdr.event_length >= sizeof(struct eventlog_event_header));
+ MPASS(hdr.event_length <= (available - bytes_consumed));
+ MPASS(offset + hdr.event_length <= pcpu_buf->buffer_size);
+ MPASS((available - bytes_consumed - hdr.event_length) == 0 ||
+ (available - bytes_consumed - hdr.event_length)
+ >= sizeof(struct eventlog_event_header));
+
+ next_timestamp = hdr.timestamp;
+
+ if (next_timestamp > max_timestamp)
+ break;
+
+ /* Only whole events are copied; stop before one that won't fit. */
+ if (bytes_consumed + hdr.event_length > space_avail) {
+ *uio_out_of_space_out = true;
+ break;
+ }
+
+ bytes_consumed += hdr.event_length;
+
+ } while (available > bytes_consumed);
+
+ /* Copy the data into the uio buffer. */
+ if (bytes_consumed > 0) {
+ const char *src;
+
+ src = (char *)((uint8_t *)pcpu_buf->buffers[reader] +
+ pcpu_buf->read_pos);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(THREAD_CAN_SLEEP(),
+ ("eventlog copyout in non-sleepable context"));
+ if (copyout(src, uio->uio_iov[0].iov_base,
+ bytes_consumed) != 0) {
+ *uio_out_of_space_out = true;
+ goto out;
+ }
+ uioadvance(uio, bytes_consumed);
+ } else {
+ bcopy(src, uio->uio_iov[0].iov_base, bytes_consumed);
+ uioadvance(uio, bytes_consumed);
+ }
+
+ pcpu_buf->read_pos += bytes_consumed;
+ EVENTLOG_VALIDATE_READER(pcpu_buf);
+
+ /* 0 tells the caller the reader buffer is fully consumed. */
+ if (pcpu_buf->read_pos ==
+ evtlog_state_reader_len(evtlog_load_state(pcpu_buf)))
+ next_timestamp = 0;
+ }
+
+out:
+ *next_timestamp_out = next_timestamp;
+}
+
+/*
+ * Write an event to a device-based subscriber's per-CPU buffer. Format:
+ * header (includes provider_id, session_id, event_id) + payload.
+ *
+ * Implements the writer side of the SYNC MODEL at the top of this file;
+ * the four numbered steps below correspond to (1)-(4) in that comment.
+ *
+ * Wakes the reader only after a proactive swap (a full buffer's worth of
+ * data just moved into the reader buffer). Normal commits do not wake;
+ * the reader is woken in batches.
+ *
+ * The target buffer is selected by hdr->cpu and the bytes written are
+ * *hdr followed by the iov segments in order. When the event cannot be
+ * placed it is dropped and subscriber->dropped_events is incremented.
+ * In this function `provider` is used only in an assertion and
+ * `session_id` is not used at all; the header already carries both ids.
+ *
+ * NOTE(review): event_len is initialised from hdr->event_length before
+ * MPASS(hdr != NULL) is evaluated.
+ */
+static EVENTLOG_INLINING void
+eventlog_subscriber_write_event_device(struct eventlog_subscriber *subscriber,
+ struct eventlog_provider *provider, uint64_t session_id,
+ struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
+ size_t payload_size)
+{
+ struct eventlog_percpu_buffer *pcpu_buf;
+ uint8_t *buf;
+ int active;
+ uint32_t commit_pos;
+ evtlog_state_t state;
+ size_t event_len = hdr->event_length;
+ bool did_swap = false;
+ int i;
+
+ MPASS(subscriber != NULL);
+ MPASS(provider != NULL);
+ MPASS(hdr != NULL);
+ MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
+ MPASS(hdr->cpu >= 0 && hdr->cpu <= mp_maxid);
+#ifdef INVARIANTS
+ size_t expected_length = sizeof(struct eventlog_event_header) +
+ payload_size;
+ MPASS(hdr->event_length == expected_length);
+#endif
+
+ pcpu_buf = &subscriber->u.device.percpu_buffers[hdr->cpu];
+ MPASS(event_len <= pcpu_buf->buffer_size);
+
+#ifndef EVENTLOG_HAS_ATOMIC64
+ /* NMI-on-lock-holder deadlock guard; see SYNC MODEL. */
+ if (__predict_false(mtx_owned(&pcpu_buf->swap_lock))) {
+ atomic_add_long(&subscriber->dropped_events, 1);
+ return;
+ }
+#endif
+
+ /* (1) Load state to get active buffer and write offset. */
+ state = evtlog_load_state(pcpu_buf);
+ active = evtlog_state_active(state);
+ commit_pos = evtlog_state_commit_pos(state);
+
+write:
+ /*
+ * (2) Check capacity (re-derived every retry: an NMI or peer may
+ * have advanced commit_pos since we last loaded it).
+ */
+ if (__predict_false(commit_pos + event_len > pcpu_buf->buffer_size)) {
+ /* did_swap limits this call to a single swap attempt. */
+ if (!did_swap && evtlog_state_swap_allowed(state)) {
+ evtlog_try_swap(pcpu_buf, &state);
+ /*
+ * *state holds the post-swap packed state regardless
+ * of who won; re-derive active/commit_pos from it so
+ * we never write at offset 0 over a peer's event.
+ */
+ active = evtlog_state_active(state);
+ commit_pos = evtlog_state_commit_pos(state);
+ did_swap = true;
+ if (__predict_false(commit_pos + event_len >
+ pcpu_buf->buffer_size)) {
+ /*
+ * No room after the swap; a same-CPU NMI
+ * writer filled the new buffer. Drop.
+ */
+ atomic_add_long(&subscriber->dropped_events, 1);
+ return;
+ }
+ } else {
+ /* Full and no swap possible (or already used): drop. */
+ atomic_add_long(&subscriber->dropped_events, 1);
+ return;
+ }
+ }
+
+ /* (3) Write: copy header then iov segments at commit_pos. */
+ buf = (uint8_t *)pcpu_buf->buffers[active] + commit_pos;
+ memcpy(buf, hdr, sizeof(struct eventlog_event_header));
+ buf += sizeof(struct eventlog_event_header);
+ for (i = 0; i < iovcnt; i++) {
+ if (iov[i].iov_len > 0) {
+ memcpy(buf, iov[i].iov_base, iov[i].iov_len);
+ buf += iov[i].iov_len;
+ }
+ }
+
+ /*
+ * (4) Commit: CAS to advance commit_pos. If active or commit_pos
+ * moved (peer swap or NMI commit), our memcpy is at a stale
+ * offset and we redo the write via `goto write`. Reader drain
+ * only moves the upper bits or swap_allowed; the memcpy stays
+ * valid and we just retry the CAS.
+ */
+ while (__predict_false(!evtlog_try_commit(pcpu_buf, &state,
+ (uint32_t)event_len))) {
+ if (evtlog_state_active(state) != active ||
+ evtlog_state_commit_pos(state) != commit_pos) {
+ active = evtlog_state_active(state);
+ commit_pos = evtlog_state_commit_pos(state);
+ goto write;
+ }
+ }
+
+ /*
+ * Wake reader only after a proactive swap - a full buffer's worth
+ * of data is now in the reader buffer.
+ */
+ /* The 1 -> 0 cmpset lets exactly one writer issue the wakeup. */
+ if (did_swap &&
+ atomic_cmpset_32(&subscriber->u.device.reader_waiting, 1, 0))
+ wakeup(subscriber);
+}
+
+/*
+ * Hand one event to a callback subscriber by invoking its registered
+ * callback with the header, the provider's name and name length, the
+ * session id, the payload and the subscriber's callback argument.
+ *
+ * The payload is forwarded as the same scatter/gather iovec the write
+ * path carries internally; a callback that needs a flat payload has to
+ * compact it itself.
+ */
+static EVENTLOG_INLINING void
+eventlog_subscriber_write_event_callback(
+    struct eventlog_subscriber *subscriber,
+    struct eventlog_provider *provider, uint64_t session_id,
+    struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
+    size_t payload_size)
+{
+
+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_CALLBACK);
+	MPASS(subscriber->u.callback.callback != NULL);
+
+	(*subscriber->u.callback.callback)(hdr,
+	    provider->name, provider->name_len,
+	    session_id,
+	    iov, iovcnt, payload_size,
+	    subscriber->u.callback.callback_arg);
+}
+
+/*
+ * Offer an event to one subscriber.
+ *
+ * Looks up the subscriber's subscription to the session's provider; if
+ * there is one and the event passes its filter (level not above the
+ * subscribed level, at least one keyword in common), the event is routed
+ * to the device or callback writer according to the subscriber type.
+ * Otherwise nothing happens.
+ */
+static void
+eventlog_subscriber_write_event(struct eventlog_subscriber *subscriber,
+    struct eventlog_session *session, struct eventlog_event_header *hdr,
+    const struct iovec *iov, int iovcnt, size_t payload_size,
+    uint16_t event_length, enum eventlog_level level, uint32_t keywords)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_subscription *sub;
+
+	MPASS(subscriber != NULL);
+	MPASS(session != NULL);
+	MPASS(session->provider != NULL);
+	MPASS(hdr != NULL);
+	MPASS(event_length <= UINT16_MAX);
+
+	provider = session->provider;
+
+	/*
+	 * Note: Called within SMR read section.
+	 *
+	 * There is only one subscription per provider per subscriber, so
+	 * the first hit is the only candidate.
+	 */
+	CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
+		if (sub->provider == provider)
+			break;
+	}
+	if (sub == NULL)
+		return;
+
+	/* Filtered out: do not fall through to any other subscription. */
+	if (level > sub->level || (keywords & sub->keywords) == 0)
+		return;
+
+	if (subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE) {
+		eventlog_subscriber_write_event_device(subscriber, provider,
+		    session->session_id, hdr, iov, iovcnt, payload_size);
+	} else {
+		eventlog_subscriber_write_event_callback(subscriber, provider,
+		    session->session_id, hdr, iov, iovcnt, payload_size);
+	}
+}
+
+/*
+ * Fill `stats` with the subscriber's statistics: currently the number of
+ * dropped events, read with acquire semantics.
+ */
+void
+eventlog_subscriber_get_stats(struct eventlog_subscriber *subscriber,
+    struct eventlog_stats *stats)
+{
+	u_long dropped;
+
+	MPASS(subscriber != NULL);
+	MPASS(stats != NULL);
+
+	dropped = atomic_load_acq_long(&subscriber->dropped_events);
+	stats->dropped_events = (uint64_t)dropped;
+}
+
+/*
+ * Device operations
+ */
+
+/*
+ * Device open handler.  No subscriber is created here; that happens via
+ * the CREATE ioctl.
+ *
+ * Jailed callers get EPERM: the eventlog framework is host-global and
+ * not safe to expose to jailed processes, so only prison0 (the host) may
+ * open it.  The device is read-only; an open requesting write, exec,
+ * append or truncate gets ENODEV.
+ */
+static int
+eventlog_dev_open(struct cdev *dev, int flags, int devtype __unused,
+    struct thread *td)
+{
+	const int disallowed = FWRITE | FEXEC | FAPPEND | O_TRUNC;
+
+	if (jailed(td->td_ucred))
+		return (EPERM);
+
+	return ((flags & disallowed) != 0 ? ENODEV : 0);
+}
+
+/*
+ * Device close handler.  Nothing to do here: cleanup is handled by
+ * eventlog_dev_clear_cdevpriv.
+ */
+static int
+eventlog_dev_close(struct cdev *dev __unused, int flags __unused,
+    int devtype __unused, struct thread *td __unused)
+{
+
+	return (0);
+}
+
+/*
+ * cdevpriv destructor: destroys the subscriber attached to the open
+ * file.  `data` may be NULL when CREATE failed and no subscriber was
+ * ever created; that case is ignored.
+ */
+static void
+eventlog_dev_clear_cdevpriv(void *data)
+{
+	struct eventlog_subscriber *subscriber = data;
+
+	if (subscriber != NULL)
+		eventlog_subscriber_destroy(subscriber);
+}
+
+/*
+ * Device ioctl handler.
+ *
+ * Commands are matched with IOCBASECMD(), i.e. ignoring the parameter
+ * length encoded in `cmd`, so every case that touches `data` validates
+ * IOCPARM_LEN(cmd) itself before reading or writing the buffer.
+ *
+ *   CREATE_BASE    Create this open file's device subscriber and add
+ *                  the requested subscriptions.  EEXIST if one already
+ *                  exists; EINVAL for a bad length, count or buffer
+ *                  size; otherwise the error from adding a subscription
+ *                  or from devfs_set_cdevpriv().
+ *   DESTROY        Destroy this open file's subscriber.  CREATE may be
+ *                  issued again afterwards.
+ *   GET_STATS      Copy subscriber statistics into `data`.
+ *   GET_PROVIDERS  Report id and name of each subscribed provider, at
+ *                  most EVENTLOG_MAX_PROVIDERS of them.
+ *
+ * Commands other than CREATE return the devfs_get_cdevpriv() error when
+ * no subscriber is attached.  Unknown commands return ENOTTY.
+ */
+static int
+eventlog_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags,
+    struct thread *td)
+{
+	struct eventlog_subscriber *subscriber;
+	struct eventlog_subscription_req *sub_req;
+	uint32_t i;
+	int error;
+
+	switch (IOCBASECMD(cmd)) {
+	case IOCBASECMD(EVENTLOG_IOCTL_CREATE_BASE): {
+		size_t base_size = __builtin_offsetof(
+		    struct eventlog_create_req, subscriptions);
+		u_int ioctl_len = IOCPARM_LEN(cmd);
+		struct eventlog_create_req *req =
+		    (struct eventlog_create_req *)data;
+
+		/* Check if subscriber already exists */
+		error = devfs_get_cdevpriv((void **)&subscriber);
+		if (error == 0)
+			return (EEXIST); /* Subscriber already exists */
+		if (error != ENOENT)
+			return (error); /* Something weird is going on */
+
+		/*
+		 * Validate request size.  The count is bounded by division
+		 * rather than by multiplying it out: count * sizeof(entry)
+		 * can wrap where size_t is 32 bits wide, which would let an
+		 * oversized count through and walk past the end of `data`.
+		 * The first test guarantees ioctl_len >= base_size, so the
+		 * subtraction cannot underflow.
+		 */
+		if (ioctl_len < base_size + sizeof(uint32_t) ||
+		    req->count > (ioctl_len - base_size) /
+		    sizeof(struct eventlog_subscription_req))
+			return (EINVAL);
+
+		if (req->buffer_size_per_cpu < EVENTLOG_BUFFER_SIZE_MIN ||
+		    req->buffer_size_per_cpu > EVENTLOG_BUFFER_SIZE_MAX)
+			return (EINVAL);
+
+		/* Create subscriber with specified buffer size */
+		subscriber = eventlog_subscriber_create_device(
+		    req->buffer_size_per_cpu);
+		MPASS(subscriber != NULL);
+
+		/*
+		 * Process each subscription before setting cdevpriv.
+		 *
+		 * NOTE(review): provider_name ends up in strcmp(); confirm
+		 * the request layout guarantees it is NUL-terminated.
+		 */
+		for (i = 0; i < req->count; i++) {
+			sub_req = &req->subscriptions[i];
+			error = eventlog_subscriber_add_subscription(
+			    subscriber, sub_req->provider_name, sub_req->level,
+			    sub_req->keywords, sub_req->flags);
+			if (error != 0) {
+				eventlog_subscriber_destroy(subscriber);
+				return (error);
+			}
+		}
+
+		/* Only store subscriber after all subscriptions succeed. */
+		error = devfs_set_cdevpriv(subscriber,
+		    eventlog_dev_clear_cdevpriv);
+		if (error != 0) {
+			eventlog_subscriber_destroy(subscriber);
+			return (error);
+		}
+
+		return (0);
+	}
+
+	case IOCBASECMD(EVENTLOG_IOCTL_DESTROY): {
+		error = devfs_get_cdevpriv((void **)&subscriber);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * Detach the private data from this open file; devfs then
+		 * runs eventlog_dev_clear_cdevpriv(), which destroys the
+		 * subscriber exactly once.  devfs_set_cdevpriv() cannot be
+		 * used to reset the slot: it fails with EBUSY while data is
+		 * attached, which would leave the freed subscriber reachable
+		 * from later ioctls/reads and destroyed again at close.
+		 */
+		devfs_clear_cdevpriv();
+
+		return (0);
+	}
+
+	case IOCBASECMD(EVENTLOG_IOCTL_GET_STATS): {
+		u_int ioctl_len = IOCPARM_LEN(cmd);
+
+		if (ioctl_len < sizeof(struct eventlog_stats))
+			return (EINVAL);
+
+		error = devfs_get_cdevpriv((void **)&subscriber);
+		if (error != 0)
+			return (error);
+
+		eventlog_subscriber_get_stats(subscriber,
+		    (struct eventlog_stats *)data);
+
+		return (0);
+	}
+
+	case IOCBASECMD(EVENTLOG_IOCTL_GET_PROVIDERS): {
+		struct eventlog_get_providers_resp *resp;
+		struct eventlog_subscription *sub;
+		uint32_t count = 0;
+
+		/*
+		 * The whole response structure is written below, so the
+		 * ioctl buffer must be at least that large; the base-command
+		 * match above does not guarantee it.
+		 */
+		if (IOCPARM_LEN(cmd) < sizeof(*resp))
+			return (EINVAL);
+
+		error = devfs_get_cdevpriv((void **)&subscriber);
+		if (error != 0)
+			return (error);
+
+		resp = (struct eventlog_get_providers_resp *)data;
+		memset(resp, 0, sizeof(*resp));
+
+		smr_enter(evl.smr);
+		CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
+			if (count >= EVENTLOG_MAX_PROVIDERS)
+				break;
+			resp->providers[count].provider_id =
+			    sub->provider->provider_id;
+			strlcpy(resp->providers[count].name,
+			    sub->provider->name,
+			    EVENTLOG_PROVIDER_NAME_MAX);
+			count++;
+		}
+		smr_exit(evl.smr);
+		resp->count = count;
+
+		return (0);
+	}
+
+	default:
+		return (ENOTTY);
+	}
+}
+
+/*
+ * Device read handler: forwards to eventlog_subscriber_read() for the
+ * subscriber attached to this open file, which reads from its per-CPU
+ * buffers.  Fails with the devfs_get_cdevpriv() error when no subscriber
+ * is attached.
+ */
+static int
+eventlog_dev_read(struct cdev *dev, struct uio *uio, int flags)
+{
+	struct eventlog_subscriber *subscriber;
+	int error;
+
+	error = devfs_get_cdevpriv((void **)&subscriber);
+	if (error == 0)
+		error = eventlog_subscriber_read(subscriber, uio, flags);
+
+	return (error);
+}
+
+/*
+ * Character-device switch for the eventlog device. Only open, close,
+ * read and ioctl entry points are provided; no write handler is set.
+ */
+static struct cdevsw eventlog_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = eventlog_dev_open,
+ .d_close = eventlog_dev_close,
+ .d_read = eventlog_dev_read,
+ .d_ioctl = eventlog_dev_ioctl,
+ .d_name = "eventlog",
+};
+
+/*
+ * Create the single system-wide eventlog device node, owned by
+ * root:operator with mode 0640.  A failure is only logged.
+ */
+static void
+eventlog_device_init(void *unused)
+{
+	struct make_dev_args args;
+	int error;
+
+	make_dev_args_init(&args);
+	args.mda_devsw = &eventlog_cdevsw;
+	args.mda_uid = UID_ROOT;
+	args.mda_gid = GID_OPERATOR;
+	args.mda_mode = 0640;
+	args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+
+	error = make_dev_s(&args, &evl.device, "eventlog");
+	if (error != 0)
+		printf("eventlog: failed to create device: %d\n", error);
+}
+SYSINIT(eventlog_device, SI_SUB_DRIVERS, SI_ORDER_MIDDLE,
+    eventlog_device_init, NULL);
diff --git a/sys/kern/kern_eventlog_test.c b/sys/kern/kern_eventlog_test.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_eventlog_test.c
@@ -0,0 +1,5173 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <tests/ktest.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventlog.h>
+#include <sys/eventlog_subscriber.h>
+#include <sys/sysctl.h>
+#include <sys/condvar.h>
+#include <sys/kthread.h>
+#include <sys/mutex.h>
+#include <sys/sleepqueue.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/time.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/fcntl.h>
+#include <machine/atomic.h>
+#include <sys/callout.h>
+#include <sys/libkern.h>
+#include <eventlog/test_eventlog.h>
+
+MALLOC_DEFINE(M_EVENTLOG_TEST, "eventlog_test", "eventlog test subsystem");
+
+/*
+ * Check a boolean condition.  On failure, log "FAIL: <expr>" through
+ * KTEST_ERR() and return EINVAL from the enclosing function; on success,
+ * log "PASS: <expr>" through KTEST_LOG().  Requires a variable named `ctx'
+ * in scope.  Because the macro returns directly, any cleanup code that
+ * follows the check is skipped when the check fails.
+ */
+#define KTEST_VERIFY(x) do { \
+ if (!(x)) { \
+ KTEST_ERR(ctx, "FAIL: %s", #x); \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s", #x); \
+ } \
+} while (0)
+
+/*
+ * Check that x == y.  On failure, log both expressions and their values
+ * through KTEST_ERR() and return EINVAL from the enclosing function; on
+ * success, log "PASS: <x> == <y>" through KTEST_LOG().  Requires a variable
+ * named `ctx' in scope.
+ *
+ * The operands may be integers, enums or pointers, so the values are
+ * captured once in correctly typed temporaries and printed as uintmax_t in
+ * hex rather than with "%d", which does not match pointer, size_t or 64-bit
+ * arguments.  Negative values therefore show up sign-extended, and on
+ * 32-bit platforms a 64-bit operand is truncated in the message only (the
+ * comparison itself always uses the full value).
+ *
+ * Because the macro returns directly, any cleanup code that follows the
+ * check is skipped when the check fails.
+ */
+#define	KTEST_EQUAL(x, y) do {						\
+	__typeof(x) _ktest_lhs = (x);					\
+	__typeof(y) _ktest_rhs = (y);					\
+									\
+	if (_ktest_lhs != _ktest_rhs) {					\
+		KTEST_ERR(ctx, "FAIL: %s != %s (%#jx != %#jx)", #x, #y,	\
+		    (uintmax_t)(uintptr_t)_ktest_lhs,			\
+		    (uintmax_t)(uintptr_t)_ktest_rhs);			\
+		return (EINVAL);					\
+	} else {							\
+		KTEST_LOG(ctx, "PASS: %s == %s", #x, #y);		\
+	}								\
+} while (0)
+
+/*
+ * Check that x != y.  On failure, log "FAIL: <x> == <y>" through
+ * KTEST_ERR() and return EINVAL from the enclosing function; on success,
+ * log "PASS: <x> != <y>" through KTEST_LOG().  Requires a variable named
+ * `ctx' in scope.  Because the macro returns directly, any cleanup code
+ * that follows the check is skipped when the check fails.
+ */
+#define KTEST_NEQUAL(x, y) do { \
+ if ((x) == (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s == %s", #x, #y); \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \
+ } \
+} while (0)
+
+/* Buffer size (64 KiB) the tests pass to eventlog_subscriber_create_device(). */
+#define EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT (64 * 1024)
+
+/*
+ * Helper: perform one read from a subscriber into a kernel buffer by
+ * wrapping the buffer in a single-segment UIO_SYSSPACE uio.  Returns the
+ * number of bytes transferred; a read error is reported as 0 bytes, so
+ * callers cannot tell an error apart from "nothing to read".
+ */
+static size_t
+eventlog_read_into_buf(struct eventlog_subscriber *subscriber,
+    void *buf, size_t bufsize, int flags)
+{
+	struct iovec vec = {
+		.iov_base = buf,
+		.iov_len = bufsize,
+	};
+	struct uio req = {
+		.uio_iov = &vec,
+		.uio_iovcnt = 1,
+		.uio_offset = 0,
+		.uio_resid = bufsize,
+		.uio_segflg = UIO_SYSSPACE,
+		.uio_rw = UIO_READ,
+		.uio_td = curthread,
+	};
+
+	if (eventlog_subscriber_read(subscriber, &req, flags) != 0)
+		return (0);
+
+	/* Whatever is no longer outstanding in the uio was copied out. */
+	return (bufsize - req.uio_resid);
+}
+
+/*
+ * Callback test data structure.  One instance is handed to
+ * test_event_callback() as its callback argument; the callback updates the
+ * fields below and the test body inspects them afterwards.
+ */
+struct test_callback_data {
+ /* Number of times the callback has run. */
+ volatile uint32_t event_count;
+ /* event_id from the header of the most recent event. */
+ volatile uint32_t last_event_id;
+ /*
+ * First iovec base of the most recent event, or NULL if it had no
+ * iovec.  This is a borrowed pointer, not a copy of the payload.
+ */
+ volatile const void *last_payload;
+ /* payload_size reported for the most recent event. */
+ volatile size_t last_payload_size;
+ /* Only used for reading in test code, not in callback */
+ struct mtx lock;
+};
+
+/*
+ * Subscriber callback used by tests that inspect the most recent event
+ * after the callback has returned.  It bumps the event counter and records
+ * the event id, the payload size and the base address of the first iovec.
+ *
+ * Only the pointer is recorded, never the data: this is safe for
+ * iovcnt <= 1 where iov[0].iov_base points at the caller's buffer;
+ * iovcnt > 1 would need to copy.
+ */
+static void
+test_event_callback(const struct eventlog_event_header *hdr,
+    const char *provider_name, uint8_t provider_name_len,
+    uint64_t session_id,
+    const struct iovec *iov, int iovcnt, size_t payload_size,
+    void *callback_arg)
+{
+	struct test_callback_data *cb = callback_arg;
+	const void *first_seg = NULL;
+
+	if (iovcnt >= 1)
+		first_seg = iov[0].iov_base;
+
+	atomic_add_int(&cb->event_count, 1);
+	atomic_store_rel_32(&cb->last_event_id, hdr->event_id);
+	cb->last_payload = first_seg;
+	atomic_store_rel_long(&cb->last_payload_size, payload_size);
+}
+
+/*
+ * Helper: enable a provider for a test by creating a callback subscriber
+ * (backed by test_event_callback()) and subscribing it to provider_name
+ * with the given level and keywords.
+ *
+ * On success returns the subscriber and stores the freshly allocated
+ * callback data in *callback_data_out; the caller must destroy the
+ * subscriber and then mtx_destroy() and free() the callback data.  On
+ * failure returns NULL, releases everything allocated here and leaves
+ * *callback_data_out untouched.
+ */
+static struct eventlog_subscriber *
+test_enable_provider_callback(const char *provider_name,
+    enum eventlog_level level, uint32_t keywords,
+    struct test_callback_data **callback_data_out)
+{
+	struct eventlog_subscriber *sub;
+	struct test_callback_data *cbd;
+
+	cbd = malloc(sizeof(*cbd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+	mtx_init(&cbd->lock, "test_callback", NULL, MTX_DEF);
+	cbd->event_count = 0;
+
+	sub = eventlog_subscriber_create_callback(test_event_callback, cbd);
+	if (sub == NULL)
+		goto fail;
+
+	if (eventlog_subscriber_add_subscription(sub, provider_name,
+	    level, keywords, 0) != 0) {
+		eventlog_subscriber_destroy(sub);
+		goto fail;
+	}
+
+	*callback_data_out = cbd;
+	return (sub);
+
+fail:
+	mtx_destroy(&cbd->lock);
+	free(cbd, M_EVENTLOG_TEST);
+	return (NULL);
+}
+
+/*
+ * Helper: enable a provider for a test by creating a device subscriber
+ * with the default buffer size and subscribing it to provider_name with
+ * the given level and keywords.  Use this when testing device-specific
+ * functionality.
+ *
+ * Returns the subscriber, which the caller must destroy when the test
+ * completes, or NULL if creation or subscription failed.
+ */
+static struct eventlog_subscriber *
+test_enable_provider_device(const char *provider_name,
+    enum eventlog_level level, uint32_t keywords)
+{
+	struct eventlog_subscriber *sub;
+	int rc;
+
+	sub = eventlog_subscriber_create_device(
+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+	if (sub == NULL)
+		return (NULL);
+
+	rc = eventlog_subscriber_add_subscription(sub, provider_name,
+	    level, keywords, 0);
+	if (rc == 0)
+		return (sub);
+
+	eventlog_subscriber_destroy(sub);
+	return (NULL);
+}
+
+/* Events delivered to subscribers created by test_enable_provider(). */
+static uint32_t test_discard_event_count;
+
+/*
+ * Callback for test_enable_provider(): counts the event and discards it.
+ * It needs no per-subscriber state, so there is nothing for the caller to
+ * free once the subscriber is destroyed.
+ */
+static void
+test_discard_callback(const struct eventlog_event_header *hdr,
+    const char *provider_name, uint8_t provider_name_len,
+    uint64_t session_id,
+    const struct iovec *iov, int iovcnt, size_t payload_size,
+    void *callback_arg)
+{
+	atomic_add_int(&test_discard_event_count, 1);
+}
+
+/*
+ * Legacy helper - enables a provider through a callback subscriber for
+ * tests that only need the provider switched on and never look at the
+ * delivered events.
+ *
+ * This used to go through test_enable_provider_callback() and drop the
+ * returned test_callback_data pointer, which leaked the allocation and its
+ * initialised mutex on every call because the caller only gets the
+ * subscriber back.  A stateless callback avoids that.
+ *
+ * Returns the subscriber, which the caller must destroy, or NULL if
+ * creation or subscription failed.
+ */
+static struct eventlog_subscriber *
+test_enable_provider(const char *provider_name, enum eventlog_level level,
+    uint32_t keywords)
+{
+	struct eventlog_subscriber *subscriber;
+
+	subscriber = eventlog_subscriber_create_callback(test_discard_callback,
+	    &test_discard_event_count);
+	if (subscriber == NULL)
+		return (NULL);
+
+	if (eventlog_subscriber_add_subscription(subscriber, provider_name,
+	    level, keywords, 0) != 0) {
+		eventlog_subscriber_destroy(subscriber);
+		return (NULL);
+	}
+
+	return (subscriber);
+}
+
+/*
+ * Helper: create a provider called `name' with the given dump-state
+ * callback and argument, then call eventlog_provider_set_default(p, 1) on
+ * it.  Returns the provider, or NULL if eventlog_provider_create() failed.
+ */
+static struct eventlog_provider *
+test_create_provider(const char *name,
+    eventlog_provider_dump_state_t dump_cb, void *dump_arg)
+{
+	struct eventlog_provider_config cfg = {
+		.dump_callback = dump_cb,
+		.dump_callback_arg = dump_arg,
+	};
+	struct eventlog_provider *prov;
+
+	prov = eventlog_provider_create(name, &cfg);
+	if (prov == NULL)
+		return (NULL);
+
+	eventlog_provider_set_default(prov, 1);
+	return (prov);
+}
+
+/*
+ * Validates provider initialization and cleanup.
+ *
+ * A newly created provider with no subscriptions must report level
+ * EVENTLOG_LEVEL_NONE and an empty keyword mask.
+ */
+KTEST_FUNC(provider_init_cleanup)
+{
+ struct eventlog_provider *provider;
+
+ KTEST_LOG(ctx, "Testing provider initialization and cleanup");
+
+ provider = test_create_provider("test_init", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+ /* Nothing has subscribed yet, so nothing should be enabled. */
+ KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_NONE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0);
+
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Validates session creation and destruction.
+ *
+ * Creating a session against a NULL provider must fail with NULL; creating
+ * one against a real provider must succeed, and the session is destroyed
+ * before its provider.
+ */
+KTEST_FUNC(session_create_destroy)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+
+ KTEST_LOG(ctx, "Testing session creation and destruction");
+
+ /* NULL provider returns NULL */
+ session = eventlog_session_create(NULL, 0, true, NULL, 0);
+ KTEST_EQUAL(session, NULL);
+
+ provider = test_create_provider("test_sess", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Tear down in reverse order of creation. */
+ eventlog_session_destroy(session);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Validates basic event logging functionality.
+ *
+ * Smoke test for the write path: with a catch-all subscription in place,
+ * writing one event through a session must complete and everything must
+ * tear down afterwards.  The delivered event is not inspected.
+ */
+KTEST_FUNC(event_logging_basic)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_subscriber *test_sub;
+	struct eventlog_session *session;
+	const uint32_t event_id = 0x12345678;
+	uint32_t payload = 0xdeadbeef;
+
+	KTEST_LOG(ctx, "Testing basic event logging");
+
+	provider = test_create_provider("test_basic", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	/* Subscribe at the most verbose level with every keyword bit set. */
+	test_sub = test_enable_provider("test_basic", EVENTLOG_LEVEL_VERBOSE,
+	    0xFFFFFFFF);
+	KTEST_NEQUAL(test_sub, NULL);
+
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* Emit a single event carrying a 32-bit payload. */
+	eventlog_event_write(session, event_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+	    &payload, sizeof(payload));
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(test_sub);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Validates multiple events can be logged.
+ *
+ * Writes three events through one session and checks, via a callback
+ * subscriber, that four events arrived (SESSION_CREATE plus the three user
+ * events) and that the last one carries the id and payload of the third
+ * write.
+ */
+KTEST_FUNC(event_logging_multiple)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *test_sub;
+	struct test_callback_data *callback_data;
+	const volatile uint32_t *payload;
+	uint32_t test_id1 = 0x11111111;
+	uint32_t test_id2 = 0x22222222;
+	uint32_t test_id3 = 0x33333333;
+	uint32_t data1 = 0xAAAAAAAA;
+	uint32_t data2 = 0xBBBBBBBB;
+	uint32_t data3 = 0xCCCCCCCC;
+	uint32_t ec, eid, last_payload_val;
+
+	KTEST_LOG(ctx, "Testing multiple event logging");
+
+	provider = test_create_provider("test_multi", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+	/* Enable provider for testing with callback subscriber */
+	test_sub = test_enable_provider_callback("test_multi",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
+	KTEST_NEQUAL(test_sub, NULL);
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	eventlog_event_write(session, test_id1, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, &data1, sizeof(data1));
+	eventlog_event_write(session, test_id2, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, &data2, sizeof(data2));
+	eventlog_event_write(session, test_id3, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, &data3, sizeof(data3));
+
+	/*
+	 * Snapshot the callback state (read then unlock; KTEST_EQUAL may
+	 * sleep).
+	 *
+	 * last_payload is a borrowed pointer.  It is only known to be valid
+	 * when the last event seen is our third write, in which case it
+	 * points at data3.  If delivery went wrong it may be NULL or refer to
+	 * some other event's buffer, so it is only dereferenced after that
+	 * has been established; otherwise the checks below report the
+	 * failure instead of the kernel faulting.
+	 */
+	last_payload_val = 0;
+	mtx_lock(&callback_data->lock);
+	ec = atomic_load_acq_32(&callback_data->event_count);
+	eid = atomic_load_acq_32(&callback_data->last_event_id);
+	payload = callback_data->last_payload;
+	if (payload != NULL && eid == test_id3)
+		last_payload_val = *payload;
+	mtx_unlock(&callback_data->lock);
+
+	KTEST_EQUAL(ec, 4); /* SESSION_CREATE + 3 user events */
+	KTEST_EQUAL(eid, test_id3);
+	KTEST_EQUAL(last_payload_val, data3);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(test_sub);
+	mtx_destroy(&callback_data->lock);
+	free(callback_data, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Validates multiple providers can coexist.
+ *
+ * Creates two differently named providers, opens one session on each and
+ * tears everything down again.  No events are written.
+ */
+KTEST_FUNC(provider_independence)
+{
+ struct eventlog_provider *provider1, *provider2;
+ struct eventlog_session *session1, *session2;
+
+ KTEST_LOG(ctx, "Testing provider independence");
+
+ provider1 = test_create_provider("test_provider1", NULL, NULL);
+ KTEST_NEQUAL(provider1, NULL);
+ provider2 = test_create_provider("test_provider2", NULL, NULL);
+ KTEST_NEQUAL(provider2, NULL);
+
+ /* Both sessions are created before either result is checked. */
+ session1 = eventlog_session_create(provider1, 0, true, NULL, 0);
+ session2 = eventlog_session_create(provider2, 0, true, NULL, 0);
+ KTEST_NEQUAL(session1, NULL);
+ KTEST_NEQUAL(session2, NULL);
+
+ eventlog_session_destroy(session1);
+ eventlog_session_destroy(session2);
+ eventlog_provider_destroy(provider1);
+ eventlog_provider_destroy(provider2);
+
+ return (0);
+}
+
+/*
+ * Validates event data integrity - verifies that multiple events are stored
+ * independently and don't interfere with each other.
+ *
+ * Writes three events with distinct 16-byte payloads and checks, via a
+ * callback subscriber, that four events arrived (SESSION_CREATE plus the
+ * three user events) and that the last one has the id, size and contents
+ * of the third write.
+ */
+KTEST_FUNC(event_data_integrity)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *test_sub;
+	struct test_callback_data *callback_data;
+	uint32_t test_id1 = 0x11111111;
+	uint32_t test_id2 = 0x22222222;
+	uint32_t test_id3 = 0x33333333;
+	uint32_t test_data1[4] = {
+	    0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD };
+	uint32_t test_data2[4] = {
+	    0x11111111, 0x22222222, 0x33333333, 0x44444444 };
+	uint32_t test_data3[4] = {
+	    0x55555555, 0x66666666, 0x77777777, 0x88888888 };
+	uint32_t payload_copy[4] = { 0, 0, 0, 0 };
+	volatile const uint32_t *received_data;
+	uint32_t ec, eid;
+	size_t plen;
+	int i;
+
+	KTEST_LOG(ctx,
+	    "Testing event data integrity - multiple independent events");
+
+	provider = test_create_provider("test_integ", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+	/* Enable provider for testing with callback subscriber */
+	test_sub = test_enable_provider_callback("test_integ",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
+	KTEST_NEQUAL(test_sub, NULL);
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* Write events with different data */
+	eventlog_event_write(session, test_id1, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, test_data1, sizeof(test_data1));
+	eventlog_event_write(session, test_id2, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, test_data2, sizeof(test_data2));
+	eventlog_event_write(session, test_id3, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, test_data3, sizeof(test_data3));
+
+	/*
+	 * Snapshot the callback state (read then unlock; KTEST_EQUAL may
+	 * sleep).
+	 *
+	 * last_payload is a borrowed pointer.  Copy through it only once the
+	 * last event seen is known to be our third write with the expected
+	 * size, in which case it points at test_data3.  Otherwise it may be
+	 * NULL, stale or too short, and payload_copy stays zeroed so the
+	 * checks below report the failure instead of the kernel faulting.
+	 */
+	mtx_lock(&callback_data->lock);
+	ec = atomic_load_acq_32(&callback_data->event_count);
+	eid = atomic_load_acq_32(&callback_data->last_event_id);
+	plen = atomic_load_acq_long(&callback_data->last_payload_size);
+	received_data = (volatile const uint32_t *)
+	    callback_data->last_payload;
+	if (received_data != NULL && eid == test_id3 &&
+	    plen == sizeof(test_data3)) {
+		for (i = 0; i < 4; i++)
+			payload_copy[i] = received_data[i];
+	}
+	mtx_unlock(&callback_data->lock);
+
+	KTEST_EQUAL(ec, 4); /* SESSION_CREATE + 3 user events */
+	KTEST_EQUAL(eid, test_id3);
+	KTEST_EQUAL(plen, sizeof(test_data3));
+	for (i = 0; i < 4; i++)
+		KTEST_EQUAL(payload_copy[i], test_data3[i]);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(test_sub);
+	mtx_destroy(&callback_data->lock);
+	free(callback_data, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Validates different event sizes - creates events and writes full payloads.
+ *
+ * One provider receives a 64-byte event and another a 4096-byte event.
+ * Each provider has its own callback subscriber, which must have seen
+ * SESSION_CREATE plus one user event with the expected id and payload size.
+ */
+KTEST_FUNC(event_size_variations)
+{
+	enum { SMALL_SIZE = 64, LARGE_SIZE = 4096 };
+	/*
+	 * The large payload lives in static storage rather than on the
+	 * stack: kernel stacks are small, and sizing the arrays from
+	 * `const size_t' locals made them variable-length arrays.  The
+	 * buffer is fully rewritten on every run.
+	 */
+	static uint8_t data_large[LARGE_SIZE];
+	uint8_t data_small[SMALL_SIZE];
+	struct eventlog_provider *provider_small, *provider_large;
+	struct eventlog_session *session_small, *session_large;
+	struct test_callback_data *callback_data_small, *callback_data_large;
+	struct eventlog_subscriber *test_sub_small;
+	struct eventlog_subscriber *test_sub_large;
+	uint32_t test_id_small = 0x1111;
+	uint32_t test_id_large = 0x2222;
+	uint32_t ec_small, eid_small, ec_large, eid_large;
+	size_t plen_small, plen_large;
+	size_t i;
+
+	KTEST_LOG(ctx, "Testing different event sizes with full payloads");
+
+	provider_small = test_create_provider("test_small", NULL, NULL);
+	KTEST_NEQUAL(provider_small, NULL);
+	provider_large = test_create_provider("test_large", NULL, NULL);
+	KTEST_NEQUAL(provider_large, NULL);
+
+	/* Enable providers for testing with callback subscribers */
+	test_sub_small = test_enable_provider_callback("test_small",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data_small);
+	test_sub_large = test_enable_provider_callback("test_large",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data_large);
+	KTEST_NEQUAL(test_sub_small, NULL);
+	KTEST_NEQUAL(test_sub_large, NULL);
+
+	session_small = eventlog_session_create(provider_small, 0, true, NULL,
+	    0);
+	session_large = eventlog_session_create(provider_large, 0, true, NULL,
+	    0);
+	KTEST_NEQUAL(session_small, NULL);
+	KTEST_NEQUAL(session_large, NULL);
+
+	/* Create small event payload */
+	for (i = 0; i < sizeof(data_small); i++)
+		data_small[i] = (uint8_t)(i & 0xFF);
+	eventlog_event_write(session_small, test_id_small, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, data_small, sizeof(data_small));
+
+	/* Create large event payload */
+	for (i = 0; i < sizeof(data_large); i++)
+		data_large[i] = (uint8_t)((i ^ 0xAA) & 0xFF);
+	eventlog_event_write(session_large, test_id_large, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, data_large, sizeof(data_large));
+
+	/*
+	 * Verify events were received (read then unlock; KTEST_EQUAL may
+	 * sleep)
+	 */
+	mtx_lock(&callback_data_small->lock);
+	ec_small = atomic_load_acq_32(&callback_data_small->event_count);
+	eid_small = atomic_load_acq_32(&callback_data_small->last_event_id);
+	plen_small = atomic_load_acq_long(
+	    &callback_data_small->last_payload_size);
+	mtx_unlock(&callback_data_small->lock);
+	/* SESSION_CREATE + 1 user event */
+	KTEST_EQUAL(ec_small, 2);
+	KTEST_EQUAL(eid_small, test_id_small);
+	KTEST_EQUAL(plen_small, sizeof(data_small));
+
+	mtx_lock(&callback_data_large->lock);
+	ec_large = atomic_load_acq_32(&callback_data_large->event_count);
+	eid_large = atomic_load_acq_32(&callback_data_large->last_event_id);
+	plen_large = atomic_load_acq_long(
+	    &callback_data_large->last_payload_size);
+	mtx_unlock(&callback_data_large->lock);
+	/* SESSION_CREATE + 1 user event */
+	KTEST_EQUAL(ec_large, 2);
+	KTEST_EQUAL(eid_large, test_id_large);
+	KTEST_EQUAL(plen_large, sizeof(data_large));
+
+	eventlog_session_destroy(session_small);
+	eventlog_session_destroy(session_large);
+	eventlog_subscriber_destroy(test_sub_small);
+	eventlog_subscriber_destroy(test_sub_large);
+	mtx_destroy(&callback_data_small->lock);
+	mtx_destroy(&callback_data_large->lock);
+	free(callback_data_small, M_EVENTLOG_TEST);
+	free(callback_data_large, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider_small);
+	eventlog_provider_destroy(provider_large);
+
+	return (0);
+}
+
+/*
+ * Structure for passing data to thread function.  Filled in by the test
+ * before the worker thread starts; the worker reports back through
+ * events_created and done.
+ */
+struct mt_test_data {
+ /* Session the worker writes its events to. */
+ struct eventlog_session *session;
+ /* Worker identity; recorded in each payload and used to derive event ids. */
+ uint32_t thread_id;
+ /* Number of events the worker should write. */
+ uint32_t num_events;
+ /* Number of events the worker has written so far. */
+ uint32_t events_created;
+ /* Protects `done'. */
+ struct mtx completion_mtx;
+ /* Set to 1 by the worker when finished; its address is the wakeup channel. */
+ int done;
+};
+
+/*
+ * Worker thread body for the multi-threaded logging test.  Writes
+ * num_events events to the shared session, each carrying the pair
+ * (thread_id, index) as payload and using thread_id * 0x10000 + index as
+ * the event id, then marks itself done, wakes the waiter and exits.
+ */
+static void
+mt_event_thread(void *arg)
+{
+	struct mt_test_data *td = arg;
+	struct eventlog_session *sess = td->session;
+	const uint32_t id_base = td->thread_id * 0x10000;
+	uint32_t payload[2];
+	uint32_t n = 0;
+
+	while (n < td->num_events) {
+		/* Payload identifies the writer and the event's sequence number. */
+		payload[0] = td->thread_id;
+		payload[1] = n;
+
+		eventlog_event_write(sess, id_base + n,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, payload,
+		    sizeof(payload));
+		td->events_created++;
+		n++;
+	}
+
+	/* Publish completion under the mutex the waiter sleeps on. */
+	mtx_lock(&td->completion_mtx);
+	td->done = 1;
+	wakeup(&td->done);
+	mtx_unlock(&td->completion_mtx);
+
+	kthread_exit();
+}
+
+/*
+ * Validates multi-threaded event logging - creates a thread and has both
+ * threads create many events concurrently to test for race conditions.
+ *
+ * The worker and the test thread each write num_events_per_thread events
+ * to the same session.  The callback subscriber must end up having seen
+ * SESSION_CREATE plus the events from both threads.
+ */
+KTEST_FUNC(multithreaded_logging)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct mt_test_data thread_data;
+ struct thread *thread;
+ uint32_t main_thread_id = 0xAAAA;
+ uint32_t thread_id = 0xBBBB;
+ uint32_t num_events_per_thread = 100;
+ uint32_t i;
+ int error;
+
+ KTEST_LOG(ctx, "Testing multi-threaded event logging");
+
+ provider = test_create_provider("test_mt", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+ /* Enable provider for testing with callback subscriber */
+ struct test_callback_data *callback_data;
+ struct eventlog_subscriber *test_sub;
+
+ test_sub = test_enable_provider_callback("test_mt",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
+ KTEST_NEQUAL(test_sub, NULL);
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /*
+ * Initialize thread data structure.  It lives on this function's
+ * stack, so the function must not return before the worker has set
+ * `done'.
+ */
+ bzero(&thread_data, sizeof(thread_data));
+ thread_data.session = session;
+ thread_data.thread_id = thread_id;
+ thread_data.num_events = num_events_per_thread;
+ thread_data.events_created = 0;
+ thread_data.done = 0;
+ mtx_init(&thread_data.completion_mtx, "mt_test", NULL, MTX_DEF);
+
+ /* Create the thread */
+ error = kthread_add(mt_event_thread, &thread_data, NULL, &thread,
+ 0, 0, "eventlog_mt_test");
+ KTEST_EQUAL(error, 0);
+
+ /* Main thread creates events concurrently with the new thread */
+ uint32_t main_event_data[2];
+ for (i = 0; i < num_events_per_thread; i++) {
+ main_event_data[0] = main_thread_id;
+ main_event_data[1] = i;
+ eventlog_event_write(session, main_thread_id + i,
+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, main_event_data,
+ sizeof(main_event_data));
+ }
+
+ /*
+ * Wait for thread to complete.  The worker sets `done' and calls
+ * wakeup() while holding completion_mtx, so the flag is re-checked
+ * under the same mutex after every wakeup.
+ */
+ mtx_lock(&thread_data.completion_mtx);
+ while (thread_data.done == 0) {
+ msleep(&thread_data.done, &thread_data.completion_mtx, 0,
+ "mt_wait", 0);
+ }
+ mtx_unlock(&thread_data.completion_mtx);
+
+ /* Verify thread created expected number of events */
+ KTEST_EQUAL(thread_data.events_created, num_events_per_thread);
+
+ /*
+ * Verify total events received via callback (read then unlock;
+ * KTEST_EQUAL may sleep)
+ */
+ {
+ uint32_t ec;
+ mtx_lock(&callback_data->lock);
+ ec = atomic_load_acq_32(&callback_data->event_count);
+ mtx_unlock(&callback_data->lock);
+ /* SESSION_CREATE + events from 2 threads */
+ KTEST_EQUAL(ec, 1 + num_events_per_thread * 2);
+ }
+
+ mtx_destroy(&thread_data.completion_mtx);
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(test_sub);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Validates subscriber creation and destruction for both types.
+ *
+ * A device subscriber and a callback subscriber are each created and
+ * destroyed without ever being subscribed to a provider.
+ */
+KTEST_FUNC(subscriber_create_destroy)
+{
+ struct eventlog_subscriber *subscriber_device, *subscriber_callback;
+ struct test_callback_data *callback_data;
+
+ KTEST_LOG(ctx, "Testing subscriber creation and destruction");
+
+ /* Test device subscriber */
+ subscriber_device = eventlog_subscriber_create_device(
+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+ KTEST_NEQUAL(subscriber_device, NULL);
+ eventlog_subscriber_destroy(subscriber_device);
+
+ /* Test callback subscriber */
+ callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data->lock, "test_callback", NULL, MTX_DEF);
+ subscriber_callback = eventlog_subscriber_create_callback(
+ test_event_callback, callback_data);
+ KTEST_NEQUAL(subscriber_callback, NULL);
+ /* Destroy the subscriber before releasing the data it was given. */
+ eventlog_subscriber_destroy(subscriber_callback);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+
+ return (0);
+}
+
+
+/*
+ * Validates multiple subscribers with the same provider.
+ *
+ * One device subscriber and two callback subscribers subscribe to the same
+ * provider with different levels and keyword masks.  The provider's
+ * reported keywords and level are checked after all three subscribe, after
+ * one is destroyed, and after all are destroyed.
+ */
+KTEST_FUNC(subscriber_multiple_subscribers)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *sub1, *sub2, *sub3;
+ struct test_callback_data *callback_data2, *callback_data3;
+ int error;
+
+ KTEST_LOG(ctx, "Testing multiple subscribers with same provider");
+
+ provider = test_create_provider("test_msub", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* Mix device and callback subscribers */
+ sub1 = eventlog_subscriber_create_device(
+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+ callback_data2 = malloc(sizeof(*callback_data2), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data2->lock, "test_callback2", NULL, MTX_DEF);
+ sub2 = eventlog_subscriber_create_callback(test_event_callback,
+ callback_data2);
+ callback_data3 = malloc(sizeof(*callback_data3), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data3->lock, "test_callback3", NULL, MTX_DEF);
+ sub3 = eventlog_subscriber_create_callback(test_event_callback,
+ callback_data3);
+ KTEST_NEQUAL(sub1, NULL);
+ KTEST_NEQUAL(sub2, NULL);
+ KTEST_NEQUAL(sub3, NULL);
+
+ /* Each subscriber subscribes with different parameters */
+ error = eventlog_subscriber_add_subscription(sub1, "test_msub",
+ EVENTLOG_LEVEL_INFO, 0x1, 0);
+ KTEST_EQUAL(error, 0);
+
+ error = eventlog_subscriber_add_subscription(sub2, "test_msub",
+ EVENTLOG_LEVEL_WARN, 0x2, 0);
+ KTEST_EQUAL(error, 0);
+
+ error = eventlog_subscriber_add_subscription(sub3, "test_msub",
+ EVENTLOG_LEVEL_VERBOSE, 0x4, 0);
+ KTEST_EQUAL(error, 0);
+
+ /*
+ * Verify provider enablement: keywords OR'ed (0x1 | 0x2 | 0x4 = 0x7),
+ * level is MAX (most verbose) = VERBOSE
+ */
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x7);
+ KTEST_EQUAL(eventlog_provider_get_level(provider),
+ EVENTLOG_LEVEL_VERBOSE);
+
+ /* Remove one subscriber (the WARN / 0x2 one) */
+ eventlog_subscriber_destroy(sub2);
+ mtx_destroy(&callback_data2->lock);
+ free(callback_data2, M_EVENTLOG_TEST);
+
+ /* Provider should still be enabled with remaining subscribers */
+ /* 0x1 | 0x4 */
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x5);
+ /* MAX(INFO, VERBOSE) */
+ KTEST_EQUAL(eventlog_provider_get_level(provider),
+ EVENTLOG_LEVEL_VERBOSE);
+
+ /* Remove all remaining subscribers */
+ eventlog_subscriber_destroy(sub1);
+ eventlog_subscriber_destroy(sub3);
+ mtx_destroy(&callback_data3->lock);
+ free(callback_data3, M_EVENTLOG_TEST);
+
+ /* Provider should be disabled */
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0);
+ KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_NONE);
+
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Validates provider enablement aggregation (OR keywords, MAX level).
+ *
+ * Two device subscribers subscribe to the same provider and then change
+ * their levels by subscribing again.  After each step the provider must
+ * report the OR of both keyword masks and the more verbose of the two
+ * levels; the assertions below treat INFO as more verbose than WARN, and
+ * VERBOSE as more verbose than both WARN and ERROR.
+ */
+KTEST_FUNC(subscriber_provider_enablement_aggregation)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *sub1, *sub2;
+ int error;
+
+ KTEST_LOG(ctx, "Testing provider enablement aggregation");
+
+ provider = test_create_provider("test_agg", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ sub1 = eventlog_subscriber_create_device(
+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+ sub2 = eventlog_subscriber_create_device(
+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+ KTEST_NEQUAL(sub1, NULL);
+ KTEST_NEQUAL(sub2, NULL);
+
+ /* Subscriber 1: INFO level, keywords 0x1 */
+ error = eventlog_subscriber_add_subscription(sub1, "test_agg",
+ EVENTLOG_LEVEL_INFO, 0x1, 0);
+ KTEST_EQUAL(error, 0);
+ KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_INFO);
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x1);
+
+ /* Subscriber 2: WARN level, keywords 0x2 (should give INFO, 0x3) */
+ error = eventlog_subscriber_add_subscription(sub2, "test_agg",
+ EVENTLOG_LEVEL_WARN, 0x2, 0);
+ KTEST_EQUAL(error, 0);
+ /* MAX(INFO, WARN) */
+ KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_INFO);
+ /* 0x1 | 0x2 */
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x3);
+
+ /* Update subscriber 1 to VERBOSE (should give VERBOSE, since MAX) */
+ error = eventlog_subscriber_add_subscription(sub1, "test_agg",
+ EVENTLOG_LEVEL_VERBOSE, 0x1, 0);
+ KTEST_EQUAL(error, 0);
+ /* MAX(VERBOSE, WARN) */
+ KTEST_EQUAL(eventlog_provider_get_level(provider),
+ EVENTLOG_LEVEL_VERBOSE);
+ /* Still 0x1 | 0x2 */
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x3);
+
+ /* Update subscriber 2 to ERROR (should result in VERBOSE, since MAX) */
+ error = eventlog_subscriber_add_subscription(sub2, "test_agg",
+ EVENTLOG_LEVEL_ERROR, 0x2, 0);
+ KTEST_EQUAL(error, 0);
+ /* MAX(VERBOSE, ERROR) */
+ KTEST_EQUAL(eventlog_provider_get_level(provider),
+ EVENTLOG_LEVEL_VERBOSE);
+ /* Still 0x1 | 0x2 */
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x3);
+
+ /* Cleanup */
+ eventlog_subscriber_destroy(sub1);
+ eventlog_subscriber_destroy(sub2);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+
+/*
+ * Validates device subscriber buffer functionality.
+ *
+ * With a device subscriber attached, the SESSION_CREATE event and then one
+ * user event must each be readable exactly once: the first read returns
+ * data and the read that follows returns nothing.
+ */
+KTEST_FUNC(subscriber_device_buffer)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ uint32_t test_id = 0x12345678;
+ uint32_t test_data = 0xdeadbeef;
+
+ KTEST_LOG(ctx, "Testing device subscriber buffer functionality");
+
+ provider = test_create_provider("test_devbuf", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+ subscriber = test_enable_provider_device("test_devbuf",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
+ KTEST_NEQUAL(subscriber, NULL);
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Drain SESSION_CREATE from session creation */
+ char read_buf[1024];
+ size_t read;
+
+ /*
+ * eventlog_read_into_buf() returns 0 both for "no data" and for a
+ * read error, so the zero checks below cannot tell the two apart.
+ */
+ read = eventlog_read_into_buf(subscriber, read_buf, sizeof(read_buf),
+ 0);
+ KTEST_VERIFY(read > 0);
+ read = eventlog_read_into_buf(subscriber, read_buf, sizeof(read_buf),
+ 0);
+ KTEST_EQUAL(read, 0);
+
+ /* Write event */
+ eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &test_data, sizeof(test_data));
+
+ /* Verify event was written to buffer */
+ read = eventlog_read_into_buf(subscriber, read_buf, sizeof(read_buf),
+ 0);
+ KTEST_VERIFY(read > 0);
+
+ /* Verify buffer is cleared after read */
+ read = eventlog_read_into_buf(subscriber, read_buf, sizeof(read_buf),
+ 0);
+ KTEST_EQUAL(read, 0);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Validates double-buffering functionality.
+ * Tests that buffer swapping works correctly and eliminates read/write
+ * contention.
+ */
+KTEST_FUNC(subscriber_circular_buffer)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ uint32_t test_id = 0x12345678;
+ size_t i;
+ char *read_buf;
+ size_t read_buf_size = 256 * 1024;
+ size_t read;
+ struct eventlog_stats stats;
+
+ KTEST_LOG(ctx, "Testing double-buffering functionality");
+
+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+ KTEST_NEQUAL(read_buf, NULL);
+
+ /* Create provider and subscriber */
+ provider = test_create_provider("test_circ", NULL, NULL);
+ if (provider == NULL) {
+ free(read_buf, M_EVENTLOG_TEST);
+ return (EINVAL);
+ }
+
+ /*
+ * Use a buffer size (128KB) - above 64KB minimum, triggers reasonable
+ * swaps
+ */
+ subscriber = eventlog_subscriber_create_device(128 * 1024);
+ if (subscriber == NULL) {
+ free(read_buf, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+ return (EINVAL);
+ }
+
+ if (eventlog_subscriber_add_subscription(subscriber, "test_circ",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
+ free(read_buf, M_EVENTLOG_TEST);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ return (EINVAL);
+ }
+
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ if (session == NULL) {
+ free(read_buf, M_EVENTLOG_TEST);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ return (EINVAL);
+ }
+
+ /*
+ * Calculate expected event size for diagnostics (header includes
+ * provider_id, session_id)
+ */
+ size_t expected_event_size = sizeof(struct eventlog_event_header) +
+ sizeof(uint32_t);
+ size_t max_events = (128 * 1024) / expected_event_size;
+ KTEST_LOG(ctx,
+ "Expected event size: %zu bytes, buffer size: %zu bytes, "
+ "max events: %zu",
+ expected_event_size, (size_t)(128 * 1024), max_events);
+
+ /*
+ * Fill active buffer - SESSION_CREATE is first, then max_events-1
+ * user events to avoid overflow
+ */
+ for (i = 0; i < max_events - 1; i++) {
+ uint32_t val = (uint32_t)i;
+ eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
+ 0xFFFFFFFF, &val, sizeof(val));
+ }
+
+ /* Read all events - this should trigger buffer swap */
+ eventlog_subscriber_get_stats(subscriber, &stats);
+ read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size, 0);
+ KTEST_LOG(ctx, "Read %zu bytes, dropped %llu events",
+ read, (unsigned long long)stats.dropped_events);
+ KTEST_VERIFY(read > 0);
+ /* Should not drop events if buffer is large enough */
+ KTEST_VERIFY(stats.dropped_events == 0);
+
+ /* Verify buffer is cleared after read */
+ read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size, 0);
+ KTEST_EQUAL(read, 0);
+
+ /*
+ * Test buffer swap: write events, read some, then write more.
+ * After swap, writers continue on new active buffer, readers read
+ * from swapped buffer.
+ */
+ for (i = 0; i < 50; i++) {
+ uint32_t val = (uint32_t)i;
+ eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
+ 0xFFFFFFFF, &val, sizeof(val));
+ }
+
+ /* Read half of them - this swaps buffers */
+ read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size / 2,
+ 0);
+ KTEST_VERIFY(read > 0);
+
+ /*
+ * Write more events - these go to the new active buffer (no
+ * contention with reader).
+ */
+ for (i = 50; i < 100; i++) {
+ uint32_t val = (uint32_t)i;
+ eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
+ 0xFFFFFFFF, &val, sizeof(val));
+ }
+
+ /* Read remaining events from reader buffer */
+ eventlog_subscriber_get_stats(subscriber, &stats);
+ read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size, 0);
+ KTEST_VERIFY(read > 0);
+ KTEST_EQUAL(stats.dropped_events, 0);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ free(read_buf, M_EVENTLOG_TEST);
+
+ return (0);
+}
+
+/*
+ * Callback subscriber smoke test.
+ *
+ * Enables a callback subscriber on a fresh provider, creates a session and
+ * writes a single event, then checks the event count, event id, payload
+ * size and first payload word recorded in the callback data.
+ */
+KTEST_FUNC(subscriber_callback)
+{
+	struct test_callback_data *callback_data;
+	const uint32_t test_id = 0x12345678;
+	uint32_t test_data = 0xdeadbeef;
+	uint32_t ec, eid, last_payload_val;
+	size_t plen;
+
+	KTEST_LOG(ctx, "Testing callback subscriber functionality");
+
+	struct eventlog_provider *provider = test_create_provider("test_cb",
+	    NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	struct eventlog_subscriber *subscriber = test_enable_provider_callback(
+	    "test_cb", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	struct eventlog_session *session = eventlog_session_create(provider, 0,
+	    true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* Creating the session has already delivered SESSION_CREATE. */
+	KTEST_EQUAL(atomic_load_acq_32(&callback_data->event_count), 1);
+
+	/* One user event on top of that. */
+	eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+	    &test_data, sizeof(test_data));
+
+	/*
+	 * Snapshot the callback state under its lock, then release the lock
+	 * before asserting: KTEST_EQUAL may sleep.
+	 */
+	mtx_lock(&callback_data->lock);
+	ec = atomic_load_acq_32(&callback_data->event_count);
+	eid = atomic_load_acq_32(&callback_data->last_event_id);
+	plen = atomic_load_acq_long(&callback_data->last_payload_size);
+	last_payload_val =
+	    *(volatile const uint32_t *)callback_data->last_payload;
+	mtx_unlock(&callback_data->lock);
+
+	KTEST_EQUAL(ec, 2);	/* SESSION_CREATE + 1 user event */
+	KTEST_EQUAL(eid, test_id);
+	KTEST_EQUAL(plen, sizeof(test_data));
+	KTEST_EQUAL(last_payload_val, test_data);
+
+	/* Tear down; the callback data is released by the test itself. */
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	mtx_destroy(&callback_data->lock);
+	free(callback_data, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Shared state for the concurrent read/write test.
+ *
+ * The structure lives on the test function's stack, so both worker threads
+ * must have announced their exit (writer_exited / reader_exited) before the
+ * test destroys the session or returns.
+ */
+struct concurrent_test_data {
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	volatile int done;	/* Set by the test to stop both threads */
+	int writer_exited;	/* Accessed with atomics; used as wait channel */
+	int reader_exited;	/* Accessed with atomics; used as wait channel */
+	volatile uint64_t events_written;
+	volatile uint64_t events_read;
+	volatile uint64_t bytes_read;
+	struct mtx lock;
+};
+
+/*
+ * Writer thread - writes events in a loop until data->done is set, then
+ * signals writer_exited and exits.
+ */
+static void
+concurrent_writer_thread(void *arg)
+{
+	struct concurrent_test_data *data = (struct concurrent_test_data *)arg;
+	uint32_t test_id = 0x1000;
+	uint32_t test_data[10];
+	int i;
+
+	for (i = 0; i < 10; i++)
+		test_data[i] = i;
+
+	while (data->done == 0) {
+		eventlog_event_write(data->session, test_id++,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, test_data,
+		    sizeof(test_data));
+		atomic_add_64(&data->events_written, 1);
+		kern_yield(PRI_UNCHANGED);	/* Yield to allow reads */
+	}
+
+	/*
+	 * Last access to *data and to the session: after this the test is
+	 * free to tear both down.
+	 */
+	atomic_store_rel_int(&data->writer_exited, 1);
+	wakeup(&data->writer_exited);
+	kthread_exit();
+}
+
+/*
+ * Reader thread - reads events in a loop (triggering swaps) until
+ * data->done is set, then signals reader_exited and exits.
+ */
+static void
+concurrent_reader_thread(void *arg)
+{
+	struct concurrent_test_data *data = (struct concurrent_test_data *)arg;
+	char read_buf[8 * 1024];
+	size_t read_bytes;
+
+	while (data->done == 0) {
+		read_bytes = eventlog_read_into_buf(data->subscriber, read_buf,
+		    sizeof(read_buf), 0);
+		if (read_bytes > 0) {
+			atomic_add_64(&data->bytes_read, read_bytes);
+			/* Counts successful reads, not individual events. */
+			atomic_add_64(&data->events_read, 1);
+		}
+		kern_yield(PRI_UNCHANGED);	/* Yield to allow writes */
+	}
+
+	atomic_store_rel_int(&data->reader_exited, 1);
+	wakeup(&data->reader_exited);
+	kthread_exit();
+}
+
+/*
+ * Validates double-buffering race conditions.
+ *
+ * Runs one writer and one reader thread against a device subscriber for
+ * about a second so that reads and writes overlap with frequent buffer
+ * swaps, and checks that both sides keep making progress.  Returns 0 on
+ * success and EINVAL if the subscription cannot be added.
+ */
+KTEST_FUNC(subscriber_double_buffer_race)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	struct concurrent_test_data test_data;
+	struct thread *writer_thread, *reader_thread;
+	int error;
+	uint64_t initial_written, initial_read, initial_bytes;
+
+	KTEST_LOG(ctx, "Testing double-buffering race conditions");
+
+	provider = test_create_provider("test_dbrace", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	subscriber = eventlog_subscriber_create_device(128 * 1024);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	if (eventlog_subscriber_add_subscription(subscriber, "test_dbrace",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	bzero(&test_data, sizeof(test_data));
+	test_data.session = session;
+	test_data.subscriber = subscriber;
+	test_data.done = 0;
+	test_data.writer_exited = 0;
+	test_data.reader_exited = 0;
+	test_data.events_written = 0;
+	test_data.events_read = 0;
+	test_data.bytes_read = 0;
+	mtx_init(&test_data.lock, "concurrent_test", NULL, MTX_DEF);
+
+	/* Pre-fill buffer to trigger initial swap */
+	uint32_t test_id = 0x2000;
+	uint32_t prefill_data[5];
+	for (int i = 0; i < 5; i++)
+		prefill_data[i] = i;
+
+	for (int i = 0; i < 50; i++) {
+		eventlog_event_write(session, test_id++, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, prefill_data, sizeof(prefill_data));
+	}
+
+	/* Create writer thread */
+	error = kthread_add(concurrent_writer_thread, &test_data, NULL,
+	    &writer_thread, 0, 0, "evtlog_writer");
+	KTEST_EQUAL(error, 0);
+
+	/* Create reader thread */
+	error = kthread_add(concurrent_reader_thread, &test_data, NULL,
+	    &reader_thread, 0, 0, "evtlog_reader");
+	KTEST_EQUAL(error, 0);
+
+	/* Let threads run for a bit to exercise race conditions */
+	tsleep(&test_data, 0, "test_run", hz / 2);	/* 500ms */
+
+	initial_written = atomic_load_acq_64(&test_data.events_written);
+	initial_read = atomic_load_acq_64(&test_data.events_read);
+	initial_bytes = atomic_load_acq_64(&test_data.bytes_read);
+
+	KTEST_LOG(ctx,
+	    "After 500ms: wrote %llu events, read %llu times, %llu bytes",
+	    (unsigned long long)initial_written,
+	    (unsigned long long)initial_read,
+	    (unsigned long long)initial_bytes);
+
+	/* Continue for another period to ensure stability */
+	tsleep(&test_data, 0, "test_run2", hz / 2);	/* Another 500ms */
+
+	uint64_t final_written = atomic_load_acq_64(&test_data.events_written);
+	uint64_t final_read = atomic_load_acq_64(&test_data.events_read);
+	uint64_t final_bytes = atomic_load_acq_64(&test_data.bytes_read);
+
+	KTEST_LOG(ctx,
+	    "After 1s: wrote %llu events, read %llu times, %llu bytes",
+	    (unsigned long long)final_written,
+	    (unsigned long long)final_read,
+	    (unsigned long long)final_bytes);
+
+	/* Verify progress was made */
+	KTEST_VERIFY(final_written > initial_written);
+	KTEST_VERIFY(final_bytes > initial_bytes);
+
+	/*
+	 * Stop both threads and wait until each has announced its exit.
+	 * The writer must be gone before the session is destroyed and before
+	 * test_data (on this stack frame) goes out of scope; the reader must
+	 * be gone so that the drain below is the only reader.  The timed
+	 * sleeps re-check the flags, so a missed wakeup only costs hz / 10.
+	 */
+	test_data.done = 1;
+	wakeup(subscriber);
+	while (atomic_load_acq_int(&test_data.writer_exited) == 0)
+		tsleep(&test_data.writer_exited, 0, "evtlog_wrwait", hz / 10);
+	while (atomic_load_acq_int(&test_data.reader_exited) == 0)
+		tsleep(&test_data.reader_exited, 0, "evtlog_rdwait", hz / 10);
+
+	/* Drain remaining events (both threads have exited, single reader) */
+	{
+		char drain_buf[8 * 1024];
+		size_t drain_read;
+
+		do {
+			drain_read = eventlog_read_into_buf(subscriber,
+			    drain_buf, sizeof(drain_buf), 0);
+		} while (drain_read > 0);
+	}
+
+	mtx_destroy(&test_data.lock);
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Validates mid-read buffer swap scenario.
+ *
+ * Interleaves batches of writes with a small read and two full-size reads,
+ * then checks that a final read returns nothing.  Each read is expected to
+ * return data; the drop counter is only logged, not asserted.
+ *
+ * Returns 0 on success and EINVAL if any of the objects cannot be set up.
+ */
+KTEST_FUNC(subscriber_mid_read_swap)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	uint32_t test_id = 0x3000;
+	uint32_t test_data[10];
+	char *read_buf;
+	size_t read_buf_size = 64 * 1024;
+	ssize_t read_bytes;
+	struct eventlog_stats stats;
+	int i;
+
+	KTEST_LOG(ctx, "Testing mid-read buffer swap scenario");
+
+	/* malloc to avoid stack overflow in ktest taskqueue (small stack) */
+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+	KTEST_NEQUAL(read_buf, NULL);
+
+	/*
+	 * From here on read_buf is owned by this function, so setup failures
+	 * are handled with explicit cleanup, matching the other tests that
+	 * allocate a read buffer.
+	 * NOTE(review): presumably KTEST_NEQUAL returns from the test on
+	 * failure, which would leak read_buf - confirm against the macro.
+	 */
+	provider = test_create_provider("test_midswap", NULL, NULL);
+	if (provider == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		return (EINVAL);
+	}
+
+	subscriber = eventlog_subscriber_create_device(128 * 1024);
+	if (subscriber == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	if (eventlog_subscriber_add_subscription(subscriber, "test_midswap",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	if (session == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	for (i = 0; i < 10; i++)
+		test_data[i] = i;
+
+	/* Fill buffer with events */
+	for (i = 0; i < 100; i++) {
+		eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, test_data, sizeof(test_data));
+	}
+
+	/* Read a small chunk - this should trigger swap */
+	eventlog_subscriber_get_stats(subscriber, &stats);
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf, 1024, 0);
+	KTEST_VERIFY(read_bytes > 0);
+	KTEST_LOG(ctx, "First read: %zd bytes, dropped %llu", read_bytes,
+	    (unsigned long long)stats.dropped_events);
+
+	/* Write more events while reader buffer is being drained */
+	for (i = 100; i < 200; i++) {
+		eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, test_data, sizeof(test_data));
+	}
+
+	/*
+	 * Continue reading - this should drain the reader buffer and trigger
+	 * swap.
+	 */
+	eventlog_subscriber_get_stats(subscriber, &stats);
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+	KTEST_LOG(ctx, "Second read: %zd bytes, dropped %llu", read_bytes,
+	    (unsigned long long)stats.dropped_events);
+
+	/* Write more events after swap */
+	for (i = 200; i < 250; i++) {
+		eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, test_data, sizeof(test_data));
+	}
+
+	/* Read remaining events - should get events from swapped buffer */
+	eventlog_subscriber_get_stats(subscriber, &stats);
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+	KTEST_LOG(ctx, "Third read: %zd bytes, dropped %llu", read_bytes,
+	    (unsigned long long)stats.dropped_events);
+
+	/* Verify buffer is empty */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_EQUAL(read_bytes, 0);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+	free(read_buf, M_EVENTLOG_TEST);
+
+	return (0);
+}
+
+/*
+ * Shared state for the buffer boundary stress test.
+ *
+ * The structure lives on the test function's stack, so both worker threads
+ * must have announced their exit (writer_exited / reader_exited) before the
+ * test destroys the session or returns.
+ */
+struct boundary_test_data {
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	volatile int done;	/* Set by the test to stop both threads */
+	int writer_exited;	/* Accessed with atomics; used as wait channel */
+	int reader_exited;	/* Accessed with atomics; used as wait channel */
+	volatile uint64_t events_written;
+	struct mtx lock;
+};
+
+/*
+ * Writer thread - writes a stream of one-word events, mixing in a 100-word
+ * event whenever the running id is a multiple of 10, until data->done is
+ * set; then signals writer_exited and exits.
+ */
+static void
+boundary_writer_thread(void *arg)
+{
+	struct boundary_test_data *data = (struct boundary_test_data *)arg;
+	uint32_t test_id = 0x4000;
+	uint32_t small_data[1] = {0xdeadbeef};
+	uint32_t large_data[100];
+	int i;
+
+	for (i = 0; i < 100; i++)
+		large_data[i] = i;
+
+	while (data->done == 0) {
+		/* Write small events to fill buffer precisely */
+		eventlog_event_write(data->session, test_id++,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, small_data,
+		    sizeof(small_data));
+		atomic_add_64(&data->events_written, 1);
+
+		/* Occasionally write larger events to test boundaries */
+		if ((test_id % 10) == 0) {
+			eventlog_event_write(data->session, test_id++,
+			    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, large_data,
+			    sizeof(large_data));
+			atomic_add_64(&data->events_written, 1);
+		}
+
+		kern_yield(PRI_UNCHANGED);
+	}
+
+	/*
+	 * Last access to *data and to the session: after this the test is
+	 * free to tear both down.
+	 */
+	atomic_store_rel_int(&data->writer_exited, 1);
+	wakeup(&data->writer_exited);
+	kthread_exit();
+}
+
+/*
+ * Reader thread - alternates a 512-byte read with a full-buffer read to
+ * trigger frequent swaps until data->done is set, then signals
+ * reader_exited and exits.
+ */
+static void
+boundary_reader_thread(void *arg)
+{
+	struct boundary_test_data *data = (struct boundary_test_data *)arg;
+	char read_buf[8 * 1024];
+	size_t read_bytes;
+
+	while (data->done == 0) {
+		/* Read small chunks to trigger frequent swaps */
+		read_bytes = eventlog_read_into_buf(data->subscriber, read_buf,
+		    512, 0);
+		if (read_bytes > 0) {
+			/* Immediately read again to trigger swap */
+			read_bytes = eventlog_read_into_buf(data->subscriber,
+			    read_buf, sizeof(read_buf), 0);
+		}
+		kern_yield(PRI_UNCHANGED);
+	}
+
+	atomic_store_rel_int(&data->reader_exited, 1);
+	wakeup(&data->reader_exited);
+	kthread_exit();
+}
+
+/*
+ * Stress test for buffer boundary conditions.
+ *
+ * Runs the boundary writer and reader threads against a 128KB device
+ * subscriber for two seconds and checks that at least one event was
+ * written.  Returns 0 on success and EINVAL if the subscription cannot be
+ * added.
+ */
+KTEST_FUNC(subscriber_buffer_boundary_stress)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	struct boundary_test_data test_data;
+	struct thread *writer_thread, *reader_thread;
+	int error;
+	uint64_t initial_written;
+
+	KTEST_LOG(ctx, "Testing buffer boundary stress conditions");
+
+	provider = test_create_provider("test_bbstress", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	/* Use 128KB buffer to trigger boundary conditions (above 64KB min) */
+	subscriber = eventlog_subscriber_create_device(128 * 1024);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	if (eventlog_subscriber_add_subscription(subscriber, "test_bbstress",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	bzero(&test_data, sizeof(test_data));
+	test_data.session = session;
+	test_data.subscriber = subscriber;
+	test_data.done = 0;
+	test_data.writer_exited = 0;
+	test_data.reader_exited = 0;
+	test_data.events_written = 0;
+	mtx_init(&test_data.lock, "boundary_test", NULL, MTX_DEF);
+
+	/* Create writer thread */
+	error = kthread_add(boundary_writer_thread, &test_data, NULL,
+	    &writer_thread, 0, 0, "evtlog_boundary_writer");
+	KTEST_EQUAL(error, 0);
+
+	/* Create reader thread */
+	error = kthread_add(boundary_reader_thread, &test_data, NULL,
+	    &reader_thread, 0, 0, "evtlog_boundary_reader");
+	KTEST_EQUAL(error, 0);
+
+	/* Run for a period to exercise boundary conditions */
+	tsleep(&test_data, 0, "boundary_run", hz * 2);	/* 2 seconds */
+
+	initial_written = atomic_load_acq_64(&test_data.events_written);
+	KTEST_LOG(ctx, "Wrote %llu events during boundary stress test",
+	    (unsigned long long)initial_written);
+	KTEST_VERIFY(initial_written > 0);
+
+	/*
+	 * Stop both threads and wait until each has announced its exit.
+	 * The writer must be gone before the session is destroyed and before
+	 * test_data (on this stack frame) goes out of scope; the reader must
+	 * be gone so that the drain below is the only reader.  The timed
+	 * sleeps re-check the flags, so a missed wakeup only costs hz / 10.
+	 */
+	test_data.done = 1;
+	wakeup(subscriber);
+	while (atomic_load_acq_int(&test_data.writer_exited) == 0)
+		tsleep(&test_data.writer_exited, 0, "evtlog_wrwait", hz / 10);
+	while (atomic_load_acq_int(&test_data.reader_exited) == 0)
+		tsleep(&test_data.reader_exited, 0, "evtlog_rdwait", hz / 10);
+
+	/* Drain remaining events (both threads have exited, single reader) */
+	{
+		char drain_buf[8 * 1024];
+		size_t drain_read;
+
+		do {
+			drain_read = eventlog_read_into_buf(subscriber,
+			    drain_buf, sizeof(drain_buf), 0);
+		} while (drain_read > 0);
+	}
+
+	mtx_destroy(&test_data.lock);
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Stress test that fills buffers exactly to capacity.
+ * Tests edge cases where write_pos approaches buffer_size.
+ *
+ * Fills the first buffer, overflows it once (expecting no drop), fills the
+ * second buffer, overflows again (expecting exactly one drop), then reads
+ * twice and expects data both times.  Returns 0 on success and EINVAL if
+ * any of the objects cannot be set up.
+ */
+KTEST_FUNC(subscriber_buffer_fill_to_capacity)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	uint32_t test_id = 0x5000;
+	uint32_t test_data = 0x12345678;
+	char *read_buf;
+	size_t read_buf_size = 64 * 1024;
+	ssize_t read_bytes;
+	struct eventlog_stats stats;
+	size_t buffer_size_per_cpu = 128 * 1024;
+	size_t create_event_size;
+	size_t event_size;
+	size_t max_events;
+	size_t fill_count;
+	size_t i;
+
+	KTEST_LOG(ctx, "Testing buffer fill to exact capacity");
+
+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+	KTEST_NEQUAL(read_buf, NULL);
+
+	provider = test_create_provider("test_bfill", NULL, NULL);
+	if (provider == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		return (EINVAL);
+	}
+
+	subscriber = eventlog_subscriber_create_device(buffer_size_per_cpu);
+	if (subscriber == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	if (eventlog_subscriber_add_subscription(subscriber, "test_bfill",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	if (session == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	/*
+	 * SESSION_CREATE is header-only (no payload), user events carry a
+	 * uint32_t payload.  Compute how many user events fit after the
+	 * session_create event, leaving less than one event of slack.
+	 */
+	create_event_size = sizeof(struct eventlog_event_header);
+	event_size = sizeof(struct eventlog_event_header) + sizeof(uint32_t);
+	max_events = (buffer_size_per_cpu - create_event_size) / event_size;
+
+	KTEST_LOG(ctx, "Event size: %zu bytes, create size: %zu bytes, "
+	    "buffer size: %zu bytes, max user events: %zu",
+	    event_size, create_event_size, buffer_size_per_cpu, max_events);
+
+	/*
+	 * Fill buffer: session_create already wrote 1 event, then max_events
+	 * user events to leave less than event_size bytes of slack.
+	 */
+	for (i = 0; i < max_events; i++) {
+		eventlog_event_write(session, test_id + (uint32_t)i,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &test_data,
+		    sizeof(test_data));
+	}
+
+	/* Write one more - triggers proactive swap (SWAP_ALLOWED) */
+	eventlog_event_write(session, test_id + (uint32_t)max_events,
+	    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &test_data, sizeof(test_data));
+
+	/* Log before asserting so the count is visible if the check fails. */
+	eventlog_subscriber_get_stats(subscriber, &stats);
+	KTEST_LOG(ctx,
+	    "After first overflow (proactive swap): dropped %llu events",
+	    (unsigned long long)stats.dropped_events);
+	KTEST_EQUAL(stats.dropped_events, 0);
+
+	/*
+	 * Now fill the second buffer completely.  It has no session_create
+	 * event, so it holds buffer_size / event_size user events in total
+	 * (which is max_events or max_events + 1, depending on the header
+	 * size).  One of them is already there from the overflow above, so
+	 * write one fewer than that.
+	 */
+	fill_count = buffer_size_per_cpu / event_size - 1;
+	for (i = 0; i < fill_count; i++) {
+		eventlog_event_write(session, test_id + 10000 + (uint32_t)i,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &test_data,
+		    sizeof(test_data));
+	}
+
+	/* Write one more - SWAP_ALLOWED cleared (reader idle), so dropped */
+	eventlog_event_write(session, test_id + 20000, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, &test_data, sizeof(test_data));
+
+	eventlog_subscriber_get_stats(subscriber, &stats);
+	KTEST_LOG(ctx, "After second overflow (no swap): dropped %llu events",
+	    (unsigned long long)stats.dropped_events);
+	KTEST_EQUAL(stats.dropped_events, 1);
+
+	/* Read all events from the reader buffer (filled by proactive swap) */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+
+	/* Read again to get events from the second buffer */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+	free(read_buf, M_EVENTLOG_TEST);
+
+	return (0);
+}
+
+/*
+ * Emit `count` events on `session`, taking consecutive ids from *next_id
+ * and using the same payload for every event.
+ */
+static void
+rapid_swap_write_batch(struct eventlog_session *session, uint32_t *next_id,
+    uint32_t *payload, size_t payload_len, int count)
+{
+	for (int n = 0; n < count; n++) {
+		eventlog_event_write(session, (*next_id)++,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, payload, payload_len);
+	}
+}
+
+/*
+ * Stress test with rapid buffer swaps.
+ *
+ * Repeats fifty rounds of: write 30 events, read a 1KB chunk, write 20
+ * more events, read with the full buffer.  Read results inside the rounds
+ * are not checked; the test ends by reading until nothing is returned.
+ */
+KTEST_FUNC(subscriber_rapid_swap_stress)
+{
+	uint32_t test_id = 0x6000;
+	uint32_t test_data[10];
+	const size_t read_buf_size = 64 * 1024;
+	ssize_t read_bytes;
+
+	KTEST_LOG(ctx, "Testing rapid buffer swap stress");
+
+	char *read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+	KTEST_NEQUAL(read_buf, NULL);
+
+	struct eventlog_provider *provider = test_create_provider(
+	    "test_rswap", NULL, NULL);
+	if (provider == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		return (EINVAL);
+	}
+
+	struct eventlog_subscriber *subscriber =
+	    eventlog_subscriber_create_device(128 * 1024);
+	if (subscriber == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	if (eventlog_subscriber_add_subscription(subscriber, "test_rswap",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	struct eventlog_session *session = eventlog_session_create(provider, 0,
+	    true, NULL, 0);
+	if (session == NULL) {
+		free(read_buf, M_EVENTLOG_TEST);
+		eventlog_subscriber_destroy(subscriber);
+		eventlog_provider_destroy(provider);
+		return (EINVAL);
+	}
+
+	for (int k = 0; k < 10; k++)
+		test_data[k] = k;
+
+	/* Rapid cycle: write, read partially, write more, read again */
+	for (int round = 0; round < 50; round++) {
+		/* First batch of the round */
+		rapid_swap_write_batch(session, &test_id, test_data,
+		    sizeof(test_data), 30);
+
+		/* Small read, intended to trigger a swap */
+		read_bytes = eventlog_read_into_buf(subscriber, read_buf, 1024,
+		    0);
+
+		/* Second batch, written while the reader buffer drains */
+		rapid_swap_write_batch(session, &test_id, test_data,
+		    sizeof(test_data), 20);
+
+		/* Full-size read to drain the reader buffer */
+		read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+		    read_buf_size, 0);
+	}
+
+	/* Final drain: keep reading until a read returns nothing */
+	do {
+		read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+		    read_buf_size, 0);
+	} while (read_bytes > 0);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+	free(read_buf, M_EVENTLOG_TEST);
+
+	return (0);
+}
+
+/*
+ * Device subscriber creation must reject buffer sizes outside
+ * [EVENTLOG_BUFFER_SIZE_MIN, EVENTLOG_BUFFER_SIZE_MAX] and accept both
+ * endpoints of that range.
+ */
+KTEST_FUNC(subscriber_create_device_invalid_size)
+{
+	struct eventlog_subscriber *subscriber;
+
+	KTEST_LOG(ctx, "Testing device subscriber buffer size validation");
+
+	/* Rejected: one below the minimum */
+	subscriber =
+	    eventlog_subscriber_create_device(EVENTLOG_BUFFER_SIZE_MIN - 1);
+	KTEST_EQUAL(subscriber, NULL);
+
+	/* Rejected: zero */
+	subscriber = eventlog_subscriber_create_device(0);
+	KTEST_EQUAL(subscriber, NULL);
+
+	/* Rejected: one above the maximum */
+	subscriber =
+	    eventlog_subscriber_create_device(EVENTLOG_BUFFER_SIZE_MAX + 1);
+	KTEST_EQUAL(subscriber, NULL);
+
+	/* Accepted: exactly the minimum */
+	subscriber =
+	    eventlog_subscriber_create_device(EVENTLOG_BUFFER_SIZE_MIN);
+	KTEST_NEQUAL(subscriber, NULL);
+	eventlog_subscriber_destroy(subscriber);
+
+	/* Accepted: exactly the maximum */
+	subscriber =
+	    eventlog_subscriber_create_device(EVENTLOG_BUFFER_SIZE_MAX);
+	KTEST_NEQUAL(subscriber, NULL);
+	eventlog_subscriber_destroy(subscriber);
+
+	return (0);
+}
+
+/*
+ * Subscribing to a provider name that was never registered must fail with
+ * ENOENT.
+ */
+KTEST_FUNC(subscriber_add_subscription_nonexistent_provider)
+{
+	KTEST_LOG(ctx, "Testing subscription to non-existent provider");
+
+	struct eventlog_subscriber *subscriber =
+	    eventlog_subscriber_create_device(
+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	/* No provider with this name is created by the test. */
+	int error = eventlog_subscriber_add_subscription(subscriber,
+	    "nonexistent_provider_xyz", EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, 0);
+	KTEST_EQUAL(error, ENOENT);
+
+	eventlog_subscriber_destroy(subscriber);
+
+	return (0);
+}
+
+/*
+ * Exercises the failure returns of eventlog_subscriber_read():
+ * EOPNOTSUPP for a two-segment uio and for a zero-length request, and
+ * EAGAIN for a non-blocking read when no data is queued.
+ */
+KTEST_FUNC(subscriber_read_error_paths)
+{
+	struct eventlog_subscriber *subscriber;
+	struct uio uio;
+	struct iovec iov[2];
+	char *buf;
+	int error;
+
+	KTEST_LOG(ctx, "Testing subscriber read error paths");
+
+	subscriber = eventlog_subscriber_create_device(
+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	/* Case 1 - EOPNOTSUPP: request split across two iovecs */
+	buf = malloc(1024, M_EVENTLOG_TEST, M_WAITOK);
+	iov[0].iov_base = buf;
+	iov[0].iov_len = 512;
+	iov[1].iov_base = buf + 512;
+	iov[1].iov_len = 512;
+	uio.uio_iov = iov;
+	uio.uio_iovcnt = 2;
+	uio.uio_offset = 0;
+	uio.uio_resid = 1024;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_rw = UIO_READ;
+	uio.uio_td = curthread;
+
+	error = eventlog_subscriber_read(subscriber, &uio, 0);
+	KTEST_EQUAL(error, EOPNOTSUPP);
+
+	free(buf, M_EVENTLOG_TEST);
+
+	/* Case 2 - EOPNOTSUPP: single iovec but nothing requested */
+	buf = malloc(1024, M_EVENTLOG_TEST, M_WAITOK);
+	iov[0].iov_base = buf;
+	iov[0].iov_len = 1024;
+	uio.uio_iov = iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_resid = 0;
+
+	error = eventlog_subscriber_read(subscriber, &uio, 0);
+	KTEST_EQUAL(error, EOPNOTSUPP);
+
+	free(buf, M_EVENTLOG_TEST);
+
+	/* Case 3 - EAGAIN: FNONBLOCK read with no data available */
+	buf = malloc(1024, M_EVENTLOG_TEST, M_WAITOK);
+	iov[0].iov_base = buf;
+	iov[0].iov_len = 1024;
+	uio.uio_iov = iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_resid = 1024;
+
+	error = eventlog_subscriber_read(subscriber, &uio, FNONBLOCK);
+	KTEST_EQUAL(error, EAGAIN);
+
+	free(buf, M_EVENTLOG_TEST);
+
+	eventlog_subscriber_destroy(subscriber);
+
+	return (0);
+}
+
+/*
+ * Each *_destroy() entry point is called with a NULL handle; the test
+ * passes simply by getting through all three calls without crashing.
+ */
+KTEST_FUNC(null_pointer_destroy)
+{
+	KTEST_LOG(ctx, "Testing NULL pointer handling in destroy functions");
+
+	/* Provider, session, subscriber - in that order, all NULL. */
+	eventlog_provider_destroy(NULL);
+	eventlog_session_destroy(NULL);
+	eventlog_subscriber_destroy(NULL);
+
+	return (0);
+}
+
+/*
+ * Read the callback's event counter while holding its lock.  The lock is
+ * released before returning so the caller can assert without holding it.
+ */
+static uint32_t
+filter_test_event_count(struct test_callback_data *cb)
+{
+	uint32_t count;
+
+	mtx_lock(&cb->lock);
+	count = atomic_load_acq_32(&cb->event_count);
+	mtx_unlock(&cb->lock);
+
+	return (count);
+}
+
+/*
+ * Validates that events are filtered by level and keywords.
+ *
+ * The callback subscriber asks for INFO level and keyword 0x1.  Four events
+ * are written and the callback counter is checked after each: only the
+ * INFO events whose keyword mask includes 0x1 may bump it.
+ */
+KTEST_FUNC(subscriber_level_keyword_filtering)
+{
+	struct test_callback_data *callback_data;
+	const uint32_t test_id = 0x1111;
+	uint32_t test_data = 0xdeadbeef;
+	uint32_t ec;
+
+	KTEST_LOG(ctx, "Testing level and keyword filtering");
+
+	struct eventlog_provider *provider = test_create_provider(
+	    "test_filter", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	/* Subscriber wants INFO level, keyword 0x1 only */
+	struct eventlog_subscriber *subscriber = test_enable_provider_callback(
+	    "test_filter", EVENTLOG_LEVEL_INFO, 0x1, &callback_data);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	struct eventlog_session *session = eventlog_session_create(provider, 0,
+	    true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* INFO + keyword 0x1: delivered */
+	eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0x1,
+	    &test_data, sizeof(test_data));
+	ec = filter_test_event_count(callback_data);
+	KTEST_EQUAL(ec, 1);
+
+	/* VERBOSE + keyword 0x1: filtered out (VERBOSE > INFO) */
+	eventlog_event_write(session, test_id + 1, EVENTLOG_LEVEL_VERBOSE, 0x1,
+	    &test_data, sizeof(test_data));
+	ec = filter_test_event_count(callback_data);
+	KTEST_EQUAL(ec, 1);
+
+	/* INFO + keyword 0x2 only: filtered out (no keyword match) */
+	eventlog_event_write(session, test_id + 2, EVENTLOG_LEVEL_INFO, 0x2,
+	    &test_data, sizeof(test_data));
+	ec = filter_test_event_count(callback_data);
+	KTEST_EQUAL(ec, 1);
+
+	/* INFO + keywords 0x1 | 0x2: delivered (0x1 matches) */
+	eventlog_event_write(session, test_id + 3, EVENTLOG_LEVEL_INFO, 0x3,
+	    &test_data, sizeof(test_data));
+	ec = filter_test_event_count(callback_data);
+	KTEST_EQUAL(ec, 2);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	mtx_destroy(&callback_data->lock);
+	free(callback_data, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Validates that events exceeding UINT16_MAX are dropped silently.
+ *
+ * Writes one event whose header plus payload is exactly one byte over
+ * UINT16_MAX and checks that the callback subscriber's event count stays
+ * at 1 (the SESSION_CREATE event only).
+ */
+KTEST_FUNC(event_oversized_dropped)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	struct test_callback_data *callback_data;
+	uint8_t *large_payload;
+	size_t oversized_len;
+	uint32_t test_id = 0x9999;
+
+	KTEST_LOG(ctx, "Testing that oversized events are dropped");
+
+	/*
+	 * total_size = sizeof(eventlog_event_header) + payload.
+	 * Need total_size > UINT16_MAX, so pick the smallest payload that
+	 * pushes the total one byte past the limit.  Deriving it from the
+	 * header size keeps the test meaningful if the header layout
+	 * changes.
+	 */
+	oversized_len = (size_t)UINT16_MAX -
+	    sizeof(struct eventlog_event_header) + 1;
+
+	provider = test_create_provider("test_oversize", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	subscriber = test_enable_provider_callback("test_oversize",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	large_payload = malloc(oversized_len, M_EVENTLOG_TEST,
+	    M_WAITOK | M_ZERO);
+	KTEST_NEQUAL(large_payload, NULL);
+
+	/* This event exceeds UINT16_MAX and should be dropped (no callback) */
+	eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+	    large_payload, oversized_len);
+
+	/*
+	 * Read without holding lock: callback is never invoked (event dropped
+	 * before reaching subscribers).  Holding the lock across KTEST_EQUAL
+	 * can panic when kyua runs tests in taskqueue context (KTEST_LOG may
+	 * sleep).
+	 */
+	/* SESSION_CREATE only; oversized dropped */
+	KTEST_EQUAL(atomic_load_acq_32(&callback_data->event_count), 1);
+
+	free(large_payload, M_EVENTLOG_TEST);
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	mtx_destroy(&callback_data->lock);
+	free(callback_data, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Edge cases: a session created with NULL/0 as its last two arguments
+ * (presumably the session_id and its length - confirm against
+ * eventlog_session_create()) and an event written with a zero-length
+ * payload.  The event must still reach the callback, with size 0.
+ */
+KTEST_FUNC(event_edge_cases_payload_session)
+{
+	struct test_callback_data *callback_data;
+	uint32_t test_id = 0x7777;
+	uint32_t ec, eid;
+	size_t plen;
+
+	KTEST_LOG(ctx, "Testing zero-length payload and empty session_id");
+
+	struct eventlog_provider *provider = test_create_provider("test_edge",
+	    NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+
+	struct eventlog_subscriber *subscriber = test_enable_provider_callback(
+	    "test_edge", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
+	KTEST_NEQUAL(subscriber, NULL);
+
+	/* No session_id supplied (NULL pointer, zero length) */
+	struct eventlog_session *session = eventlog_session_create(provider, 0,
+	    true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* Valid payload pointer, but a length of zero */
+	eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+	    &test_id, 0);
+
+	/*
+	 * Snapshot under the callback lock and release it before asserting:
+	 * KTEST_EQUAL may sleep.
+	 */
+	mtx_lock(&callback_data->lock);
+	ec = atomic_load_acq_32(&callback_data->event_count);
+	eid = atomic_load_acq_32(&callback_data->last_event_id);
+	plen = atomic_load_acq_long(&callback_data->last_payload_size);
+	mtx_unlock(&callback_data->lock);
+
+	/* SESSION_CREATE + 1 user event (zero-length payload) */
+	KTEST_EQUAL(ec, 2);
+	KTEST_EQUAL(eid, test_id);
+	KTEST_EQUAL(plen, 0);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	mtx_destroy(&callback_data->lock);
+	free(callback_data, M_EVENTLOG_TEST);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/*
+ * Validates subscription update in place when re-subscribing to same provider.
+ *
+ * A device subscriber subscribes twice to the same provider with different
+ * level/keyword settings; after each call the provider's effective level
+ * and keywords must match the most recent subscription. Only the
+ * provider's level/keywords are checked -- the number of subscription
+ * entries held by the subscriber is not inspected here.
+ */
+KTEST_FUNC(subscriber_subscription_update_in_place)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *subscriber;
+ int error;
+
+ KTEST_LOG(ctx, "Testing subscription update in place");
+
+ provider = test_create_provider("test_subupd", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ subscriber = eventlog_subscriber_create_device(
+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ /* First subscription: INFO level, keyword mask 0x1 */
+ error = eventlog_subscriber_add_subscription(subscriber, "test_subupd",
+ EVENTLOG_LEVEL_INFO, 0x1, 0);
+ KTEST_EQUAL(error, 0);
+ KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_INFO);
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x1);
+
+ /* Re-subscribe to same provider: should update in place, not add */
+ error = eventlog_subscriber_add_subscription(subscriber, "test_subupd",
+ EVENTLOG_LEVEL_VERBOSE, 0x7, 0);
+ KTEST_EQUAL(error, 0);
+ /* Provider must now report the second subscription's settings */
+ KTEST_EQUAL(eventlog_provider_get_level(provider),
+ EVENTLOG_LEVEL_VERBOSE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x7);
+
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Exercises the schema-generated TEST_EVENTLOG_* macros against a live
+ * provider/session:
+ *   1. *_ENABLED() is 0 for every event while no subscriber exists.
+ *   2. With an INFO/BASIC callback subscriber, only SIMPLE_EVENT and
+ *      STATUS_EVENT report enabled.
+ *   3. Two *_LOG_ALWAYS() calls result in two callback deliveries.
+ *   4. Four *_LOG() calls result in two deliveries (the enabled events).
+ *   5-6. Replacing the subscriber (VERBOSE/ADVANCED, then WARN/COMPLEX)
+ *      makes FLAGS_EVENT, then COMPLEX_EVENT, report enabled.
+ *
+ * The session is created before any subscriber exists, so the callback
+ * counts below do not include a SESSION_CREATE event.
+ */
+KTEST_FUNC(schema_generated_macros)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+
+ KTEST_LOG(ctx, "Testing schema-generated macros");
+
+ /* Create provider */
+ provider = test_create_provider("test_schema", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+ session = eventlog_session_create(provider, 12345, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Test 1: Verify _ENABLED macro returns false when no subscribers */
+ KTEST_EQUAL(TEST_EVENTLOG_SIMPLE_EVENT_ENABLED(session), 0);
+ KTEST_EQUAL(TEST_EVENTLOG_STATUS_EVENT_ENABLED(session), 0);
+ KTEST_EQUAL(TEST_EVENTLOG_FLAGS_EVENT_ENABLED(session), 0);
+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 0);
+
+ /* Test 2: Create callback subscriber with BASIC keyword, INFO level */
+ struct test_callback_data *callback_data;
+ subscriber = test_enable_provider_callback("test_schema",
+ EVENTLOG_LEVEL_INFO, TEST_EVENTLOG_KEYWORD_BASIC, &callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ /* Provider enablement is auto-updated when subscription is added */
+
+ /* Verify _ENABLED macros work correctly */
+ /* INFO level, BASIC keyword */
+ KTEST_EQUAL(TEST_EVENTLOG_SIMPLE_EVENT_ENABLED(session), 1);
+ /* INFO level, BASIC keyword */
+ KTEST_EQUAL(TEST_EVENTLOG_STATUS_EVENT_ENABLED(session), 1);
+ /* VERBOSE level, ADVANCED keyword */
+ KTEST_EQUAL(TEST_EVENTLOG_FLAGS_EVENT_ENABLED(session), 0);
+ /* WARN level, COMPLEX keyword */
+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 0);
+
+ /* Test 3: Use _LOG_ALWAYS macro (always logs regardless of enabled) */
+ TEST_EVENTLOG_SIMPLE_EVENT_LOG_ALWAYS(session, 0x12345678);
+ TEST_EVENTLOG_STATUS_EVENT_LOG_ALWAYS(session, 0xABCDEF00,
+ TEST_EVENTLOG_TEST_STATUS_RUNNING);
+
+ /*
+ * Verify events were received via callback (read then unlock;
+ * KTEST_EQUAL may sleep)
+ */
+ {
+ uint32_t ec;
+ mtx_lock(&callback_data->lock);
+ ec = atomic_load_acq_32(&callback_data->event_count);
+ mtx_unlock(&callback_data->lock);
+ /* Session created before subscriber; 2 LOG_ALWAYS events */
+ KTEST_EQUAL(ec, 2);
+ }
+
+ /* Reset callback data for next test (plain store, under the lock) */
+ mtx_lock(&callback_data->lock);
+ callback_data->event_count = 0;
+ mtx_unlock(&callback_data->lock);
+
+ /* Test 4: Use _LOG macro (should check enablement first) */
+ TEST_EVENTLOG_SIMPLE_EVENT_LOG(session, 0x87654321);
+ TEST_EVENTLOG_STATUS_EVENT_LOG(session, 0xFEDCBA00,
+ TEST_EVENTLOG_TEST_STATUS_SUCCESS);
+ TEST_EVENTLOG_FLAGS_EVENT_LOG(session, 0x11111111,
+ TEST_EVENTLOG_FLAG_FLAG_A | TEST_EVENTLOG_FLAG_FLAG_B);
+ TEST_EVENTLOG_COMPLEX_EVENT_LOG(session, 0x22222222, 0x33333333,
+ TEST_EVENTLOG_TEST_STATUS_RUNNING, TEST_EVENTLOG_FLAG_FLAG_C, -42);
+
+ /*
+ * Verify only enabled events were received (read then unlock;
+ * KTEST_EQUAL may sleep)
+ */
+ {
+ uint32_t ec;
+ mtx_lock(&callback_data->lock);
+ ec = atomic_load_acq_32(&callback_data->event_count);
+ mtx_unlock(&callback_data->lock);
+ /* Only SIMPLE and STATUS (session existed before subscriber) */
+ KTEST_EQUAL(ec, 2);
+ }
+
+ /*
+ * Test 5: Replace the subscriber (destroy, then create a new one)
+ * with VERBOSE level and ADVANCED keyword. The old callback data is
+ * freed and callback_data is re-pointed by the helper.
+ */
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ subscriber = test_enable_provider_callback("test_schema",
+ EVENTLOG_LEVEL_VERBOSE, TEST_EVENTLOG_KEYWORD_ADVANCED,
+ &callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ /* Provider enablement is auto-updated when subscription is added */
+
+ /* Verify FLAGS_EVENT is now enabled */
+ KTEST_EQUAL(TEST_EVENTLOG_FLAGS_EVENT_ENABLED(session), 1);
+ /* Still WARN level */
+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 0);
+
+ /*
+ * Test 6: Replace the subscriber again, this time with WARN level
+ * and COMPLEX keyword.
+ */
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ subscriber = test_enable_provider_callback("test_schema",
+ EVENTLOG_LEVEL_WARN, TEST_EVENTLOG_KEYWORD_COMPLEX,
+ &callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ /* Provider enablement is auto-updated when subscription is added */
+
+ /* Verify COMPLEX_EVENT is now enabled */
+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 1);
+
+ /* Cleanup */
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+ return (0);
+}
+
+/*
+ * Exercise the varlen trailing-array codegen for VARLEN_EVENT { id,
+ * count, values:uint64_t[count:8] }: producer macro with partial,
+ * clamped, and zero counts; accessor returning the trailing array.
+ */
+
+/*
+ * State shared between varlen_event_callback() and the
+ * schema_varlen_event test.
+ */
+struct varlen_cb_data {
+ /* Initialized and destroyed by the test; the callback never takes it */
+ struct mtx lock;
+ /* Callback invocations, counted before any payload validation */
+ uint32_t events;
+ uint32_t matched; /* events whose payload parsed correctly */
+ /* events whose tail/head mismatched expectation */
+ uint32_t mismatch;
+ /* Size and head count of the most recent event that was parsed */
+ size_t last_payload_size;
+ uint8_t last_count;
+ /* First/last trailing value; only updated for events with count > 0 */
+ uint64_t last_first_value;
+ uint64_t last_last_value;
+};
+
+/*
+ * Subscriber callback for the varlen test.
+ *
+ * Gathers the event's iov segments into a contiguous stack buffer, runs
+ * the generated accessor over it and records the outcome in the
+ * struct varlen_cb_data passed as callback_arg:
+ *   - events is bumped for every invocation;
+ *   - a payload smaller than the fixed head or larger than the schema
+ *     maximum is ignored (neither matched nor mismatch);
+ *   - a payload whose segments do not add up to payload_size, or whose
+ *     accessor returns NULL for a non-zero count, counts as mismatch;
+ *   - everything else counts as matched.
+ */
+static void
+varlen_event_callback(const struct eventlog_event_header *hdr __unused,
+    const char *provider_name __unused, uint8_t provider_name_len __unused,
+    uint64_t session_id __unused,
+    const struct iovec *iov, int iovcnt, size_t payload_size,
+    void *callback_arg)
+{
+	struct varlen_cb_data *d = callback_arg;
+	/*
+	 * The varlen producer emits a 2-segment iov: [head][tail].
+	 * Compact it into a stack buffer for the generated accessor.
+	 * Sized from the schema's declared max, and aligned because the
+	 * buffer is accessed through struct and uint64_t pointers below.
+	 */
+	uint8_t buf[sizeof(struct test_eventlog_varlen_event) +
+	    TEST_EVENTLOG_VARLEN_EVENT_VALUES_MAX * sizeof(uint64_t)]
+	    __aligned(sizeof(uint64_t));
+	const struct test_eventlog_varlen_event *evt;
+	const uint64_t *vals;
+	size_t off;
+	int i;
+
+	atomic_add_32(&d->events, 1);
+	if (payload_size < sizeof(*evt) || payload_size > sizeof(buf))
+		return;
+
+	off = 0;
+	for (i = 0; i < iovcnt; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+		/*
+		 * Never copy past the advertised payload; payload_size was
+		 * bounded by sizeof(buf) above, so this also bounds the
+		 * write into buf. off <= payload_size holds throughout.
+		 */
+		if (iov[i].iov_len > payload_size - off) {
+			atomic_add_32(&d->mismatch, 1);
+			return;
+		}
+		memcpy(buf + off, iov[i].iov_base, iov[i].iov_len);
+		off += iov[i].iov_len;
+	}
+	/* A short gather would leave part of buf uninitialized. */
+	if (off != payload_size) {
+		atomic_add_32(&d->mismatch, 1);
+		return;
+	}
+	evt = (const struct test_eventlog_varlen_event *)buf;
+
+	vals = test_eventlog_varlen_event_values(evt, payload_size);
+	d->last_payload_size = payload_size;
+	d->last_count = evt->count;
+	if (evt->count == 0) {
+		/*
+		 * No trailing elements expected; accessor may still succeed
+		 * (payload_size == sizeof(head) + 0). Count as matched.
+		 */
+		atomic_add_32(&d->matched, 1);
+		return;
+	}
+	if (vals == NULL) {
+		atomic_add_32(&d->mismatch, 1);
+		return;
+	}
+	d->last_first_value = vals[0];
+	d->last_last_value = vals[evt->count - 1];
+	atomic_add_32(&d->matched, 1);
+}
+
+/*
+ * Logs three VARLEN_EVENT events (count 4, count 32, count 0 with a NULL
+ * values pointer) through a callback subscriber and checks that
+ * varlen_event_callback() saw three events, all matched, none mismatched.
+ * Also checks the generated accessor directly against a too-short payload.
+ *
+ * Only the last event's count and payload size are asserted; the
+ * last_first_value/last_last_value fields recorded by the callback are
+ * not checked here.
+ */
+KTEST_FUNC(schema_varlen_event)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct varlen_cb_data cb;
+ uint64_t payload[32];
+ uint32_t i;
+
+ KTEST_LOG(ctx, "Testing varlen trailing-array schema events");
+
+ bzero(&cb, sizeof(cb));
+ /* cb.lock is set up and torn down here but not taken by the callback */
+ mtx_init(&cb.lock, "varlen_cb", NULL, MTX_DEF);
+
+ provider = test_create_provider("test_varlen", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+ session = eventlog_session_create(provider, 0x4711, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Subscriber is attached after the session already exists */
+ subscriber = eventlog_subscriber_create_callback(varlen_event_callback,
+ &cb);
+ KTEST_NEQUAL(subscriber, NULL);
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_varlen", EVENTLOG_LEVEL_INFO, TEST_EVENTLOG_KEYWORD_BASIC, 0),
+ 0);
+
+ KTEST_EQUAL(TEST_EVENTLOG_VARLEN_EVENT_ENABLED(session), 1);
+
+ /* Case 1: partial count (4 of 8). Accessor should return the tail. */
+ for (i = 0; i < 4; i++)
+ payload[i] = 0xAA00ULL + i;
+ TEST_EVENTLOG_VARLEN_EVENT_LOG(session, 0xD00D, 4, payload);
+
+ /* Case 2: count > MAX. Producer macro must clamp to 8. */
+ for (i = 0; i < 32; i++)
+ payload[i] = 0xBB00ULL + i;
+ TEST_EVENTLOG_VARLEN_EVENT_LOG(session, 0xBEEF, 32, payload);
+
+ /* Case 3: count == 0, values == NULL. No tail to copy. */
+ TEST_EVENTLOG_VARLEN_EVENT_LOG(session, 0xCAFE, 0, NULL);
+
+ /*
+ * Subscribers call us synchronously from the writer; no sleep needed.
+ * The session was created BEFORE the subscriber attached, so no
+ * SESSION_CREATE is delivered here -- we only see the 3 varlen
+ * events we logged.
+ */
+ KTEST_EQUAL(atomic_load_acq_32(&cb.events), 3);
+ KTEST_EQUAL(atomic_load_acq_32(&cb.matched), 3);
+ KTEST_EQUAL(atomic_load_acq_32(&cb.mismatch), 0);
+
+ /*
+ * Last event (count == 0) should have been delivered with a payload
+ * equal to exactly sizeof(struct test_eventlog_varlen_event).
+ */
+ KTEST_EQUAL((int)cb.last_count, 0);
+ KTEST_EQUAL((int)cb.last_payload_size,
+ (int)sizeof(struct test_eventlog_varlen_event));
+
+ /*
+ * Spot-check accessor robustness against a short payload: the head
+ * claims 4 values but the size passed covers the head only, so the
+ * accessor is expected to return NULL.
+ */
+ struct test_eventlog_varlen_event evt = { .id = 0, .count = 4 };
+ KTEST_EQUAL(test_eventlog_varlen_event_values(&evt, sizeof(evt)),
+ (const uint64_t *)NULL);
+
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_session_destroy(session);
+ eventlog_provider_destroy(provider);
+ mtx_destroy(&cb.lock);
+ return (0);
+}
+
+/*
+ * Exercise eventlog_event_write_gather() directly: the iov is delivered
+ * to the callback unchanged and segments concatenate in order.
+ */
+
+/*
+ * Exercises a multi-segment iov whose compacted size exceeds any
+ * reasonable stack buffer in the framework. The iov path has no size
+ * ceiling short of UINT16_MAX (wire-format event_length cap).
+ */
+#define GATHER_BIG_PAYLOAD_SIZE 4096
+
+/*
+ * State filled in by gather_event_callback() and read back by the
+ * event_write_gather test.
+ */
+struct gather_cb_data {
+ /* Callback invocations */
+ uint32_t events;
+ /* Events with an empty payload, or with at least one non-empty segment */
+ uint32_t matched;
+ /* Events with a non-zero payload_size but no non-empty segment */
+ uint32_t mismatch;
+ /* payload_size of the most recent event */
+ size_t last_payload_size;
+ /* First byte of the first / last byte of the last non-empty segment */
+ uint8_t last_first_byte;
+ uint8_t last_last_byte;
+};
+
+/*
+ * Subscriber callback for the gather test. Records the payload size of
+ * every event and, for non-empty payloads, the very first and very last
+ * payload byte as seen through the iov (segments are inspected in place,
+ * never copied). An event with a non-zero payload_size but no non-empty
+ * segment is counted as a mismatch.
+ */
+static void
+gather_event_callback(const struct eventlog_event_header *hdr __unused,
+    const char *provider_name __unused, uint8_t provider_name_len __unused,
+    uint64_t session_id __unused,
+    const struct iovec *iov, int iovcnt, size_t payload_size,
+    void *callback_arg)
+{
+	struct gather_cb_data *cb = callback_arg;
+	const uint8_t *head = NULL;
+	const uint8_t *tail = NULL;
+	int seg;
+
+	atomic_add_32(&cb->events, 1);
+	cb->last_payload_size = payload_size;
+
+	/* Empty payload: nothing to inspect, report zero bytes. */
+	if (payload_size == 0) {
+		cb->last_first_byte = 0;
+		cb->last_last_byte = 0;
+		atomic_add_32(&cb->matched, 1);
+		return;
+	}
+
+	/*
+	 * Remember where the first non-empty segment starts and where
+	 * the last non-empty segment ends.
+	 */
+	for (seg = 0; seg < iovcnt; seg++) {
+		if (iov[seg].iov_len != 0) {
+			if (head == NULL)
+				head = iov[seg].iov_base;
+			tail = (const uint8_t *)iov[seg].iov_base +
+			    iov[seg].iov_len - 1;
+		}
+	}
+
+	if (head != NULL && tail != NULL) {
+		cb->last_first_byte = head[0];
+		cb->last_last_byte = *tail;
+		atomic_add_32(&cb->matched, 1);
+	} else {
+		atomic_add_32(&cb->mismatch, 1);
+	}
+}
+
+/*
+ * Writes four events through eventlog_event_write_gather() with 0, 1, 2
+ * and 3 iov segments and checks, via gather_event_callback(), that all
+ * four were delivered and matched, that the subscriber reports no drops,
+ * and that the last (3-segment) event has the expected total size and
+ * first/last payload bytes.
+ */
+KTEST_FUNC(event_write_gather)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct gather_cb_data cb;
+ struct iovec iov[3];
+ uint8_t seg0[8], seg1[16];
+ uint8_t *big;
+ size_t i;
+
+ KTEST_LOG(ctx, "Testing eventlog_event_write_gather() scatter/gather");
+
+ bzero(&cb, sizeof(cb));
+ provider = test_create_provider("test_gather", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+ session = eventlog_session_create(provider, 0x4712, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+ subscriber = eventlog_subscriber_create_callback(gather_event_callback,
+ &cb);
+ KTEST_NEQUAL(subscriber, NULL);
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_gather", EVENTLOG_LEVEL_INFO, TEST_EVENTLOG_KEYWORD_BASIC, 0),
+ 0);
+ eventlog_session_set_enabled(session, 1);
+
+ /* Case 1: iovcnt == 0, empty payload. */
+ eventlog_event_write_gather(session, 0x100, EVENTLOG_LEVEL_INFO,
+ TEST_EVENTLOG_KEYWORD_BASIC, NULL, 0);
+
+ /*
+ * Case 2: iovcnt == 1, contiguous buffer. gather_event_callback()
+ * never compacts; it inspects the segment in place. This event is
+ * only counted below (byte checks cover the last event only).
+ */
+ for (i = 0; i < sizeof(seg0); i++)
+ seg0[i] = (uint8_t)(0x10 + i);
+ iov[0].iov_base = seg0;
+ iov[0].iov_len = sizeof(seg0);
+ eventlog_event_write_gather(session, 0x101, EVENTLOG_LEVEL_INFO,
+ TEST_EVENTLOG_KEYWORD_BASIC, iov, 1);
+
+ /*
+ * Case 3: iovcnt == 2, small payload, seg0 followed by seg1. As with
+ * case 2, only the events/matched counters are asserted for this
+ * event; segment order is not checked here.
+ */
+ for (i = 0; i < sizeof(seg1); i++)
+ seg1[i] = (uint8_t)(0xA0 + i);
+ iov[0].iov_base = seg0;
+ iov[0].iov_len = sizeof(seg0);
+ iov[1].iov_base = seg1;
+ iov[1].iov_len = sizeof(seg1);
+ eventlog_event_write_gather(session, 0x102, EVENTLOG_LEVEL_INFO,
+ TEST_EVENTLOG_KEYWORD_BASIC, iov, 2);
+
+ /*
+ * Case 4: iovcnt == 3, large multi-segment payload. The framework
+ * passes the iov through unchanged; the callback sees three
+ * segments and reports the first byte of seg0 and the last byte
+ * of seg1. Nothing is dropped and no allocation happens.
+ */
+ big = malloc(GATHER_BIG_PAYLOAD_SIZE, M_EVENTLOG_TEST, M_WAITOK);
+ for (i = 0; i < GATHER_BIG_PAYLOAD_SIZE; i++)
+ big[i] = (uint8_t)(i & 0xFF);
+ iov[0].iov_base = seg0;
+ iov[0].iov_len = sizeof(seg0); /* bytes 0x10..0x17 */
+ iov[1].iov_base = big;
+ iov[1].iov_len = GATHER_BIG_PAYLOAD_SIZE; /* 0x00..0xFF... */
+ iov[2].iov_base = seg1;
+ iov[2].iov_len = sizeof(seg1); /* bytes 0xA0..0xAF */
+ eventlog_event_write_gather(session, 0x103, EVENTLOG_LEVEL_INFO,
+ TEST_EVENTLOG_KEYWORD_BASIC, iov, 3);
+
+ /* Four events, all matched (no mismatch, nothing dropped). */
+ KTEST_EQUAL(atomic_load_acq_32(&cb.events), 4);
+ KTEST_EQUAL(atomic_load_acq_32(&cb.matched), 4);
+ KTEST_EQUAL(atomic_load_acq_32(&cb.mismatch), 0);
+
+ {
+ struct eventlog_stats stats;
+ eventlog_subscriber_get_stats(subscriber, &stats);
+ KTEST_EQUAL(stats.dropped_events, 0);
+ }
+
+ /* Last event: first byte from seg0, last byte from seg1's end. */
+ KTEST_EQUAL((int)cb.last_payload_size,
+ (int)(sizeof(seg0) + GATHER_BIG_PAYLOAD_SIZE + sizeof(seg1)));
+ KTEST_EQUAL((int)cb.last_first_byte, 0x10);
+ KTEST_EQUAL((int)cb.last_last_byte,
+ (int)(uint8_t)(0xA0 + sizeof(seg1) - 1));
+
+ free(big, M_EVENTLOG_TEST);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_session_destroy(session);
+ eventlog_provider_destroy(provider);
+ return (0);
+}
+
+/* ===== Lock-free per-CPU buffer tests ===== */
+
+/*
+ * Thread data for multi-writer tests. One instance per writer thread,
+ * filled in by the test before kthread_add() and read back after the
+ * thread has set done.
+ */
+struct lockfree_writer_data {
+ /* Session every event is written to */
+ struct eventlog_session *session;
+ /* Barrier: all threads wait until set */
+ int *go;
+ /* Set to 1 by the thread after its last write, just before it exits */
+ int done;
+ /* Writer index; stored in the payload and in the event id's high bits */
+ uint32_t thread_idx;
+ /* Number of events this thread should write */
+ uint32_t num_events;
+ /* Incremented by the thread after each eventlog_event_write() call */
+ uint32_t events_written;
+};
+
+/*
+ * Writer kthread body for the fixed-count tests. Waits for the shared
+ * go flag, writes num_events events of the form { thread_idx, sequence }
+ * to the session, then publishes done and exits.
+ */
+static void
+lockfree_writer_thread(void *arg)
+{
+	struct lockfree_writer_data *w = arg;
+	uint32_t body[2];
+	uint32_t seq, evid;
+
+	/*
+	 * Block in short sleeps until the test releases everyone.
+	 * Sleeping rather than spinning keeps the releasing thread
+	 * runnable even when there are more writers than CPUs.
+	 */
+	for (;;) {
+		if (atomic_load_acq_32((volatile uint32_t *)w->go) != 0)
+			break;
+		tsleep(w->go, 0, "lf_go", 1);
+	}
+
+	for (seq = 0; seq < w->num_events; seq++) {
+		/* Event id: writer index in the high half, sequence low. */
+		evid = (w->thread_idx << 16) | seq;
+		body[0] = w->thread_idx;
+		body[1] = seq;
+		eventlog_event_write(w->session, evid,
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+		    body, sizeof(body));
+		w->events_written++;
+	}
+
+	/* Publish completion, then wake whoever sleeps on the done flag. */
+	atomic_store_rel_32((volatile uint32_t *)&w->done, 1);
+	wakeup(&w->done);
+	kthread_exit();
+}
+
+/*
+ * Stress the lock-free write/commit path by having many writers
+ * concurrently write to the same device subscriber. All writers start
+ * simultaneously to maximize contention on per-CPU buffers.
+ */
+KTEST_FUNC(lockfree_many_concurrent_writers)
+{
+#define LF_NUM_WRITERS 8
+#define LF_EVENTS_PER_WRITER 500
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ /* Per-thread state and the go flag live on this stack frame */
+ struct lockfree_writer_data writers[LF_NUM_WRITERS];
+ struct thread *threads[LF_NUM_WRITERS];
+ int go = 0;
+ struct eventlog_stats stats;
+ char *read_buf;
+ size_t read_buf_size = 256 * 1024;
+ size_t total_read = 0;
+ size_t read_bytes;
+ int i, error;
+
+ KTEST_LOG(ctx,
+ "Testing lock-free concurrent writers (%d threads, %d events each)",
+ LF_NUM_WRITERS, LF_EVENTS_PER_WRITER);
+
+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+ KTEST_NEQUAL(read_buf, NULL);
+
+ provider = test_create_provider("test_lf_many", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ subscriber = eventlog_subscriber_create_device(256 * 1024);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber, "test_lf_many",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ /* Session is created after subscribing (see the +1 for SESSION_CREATE) */
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /*
+ * Create all writer threads. Each one sleeps in short tsleep()
+ * intervals in lockfree_writer_thread() until go is set.
+ */
+ for (i = 0; i < LF_NUM_WRITERS; i++) {
+ bzero(&writers[i], sizeof(writers[i]));
+ writers[i].session = session;
+ writers[i].go = &go;
+ writers[i].thread_idx = i;
+ writers[i].num_events = LF_EVENTS_PER_WRITER;
+ error = kthread_add(lockfree_writer_thread, &writers[i], NULL,
+ &threads[i], 0, 0, "lf_writer_%d", i);
+ KTEST_EQUAL(error, 0);
+ }
+
+ /* Release all writers simultaneously */
+ atomic_store_rel_32((volatile uint32_t *)&go, 1);
+ wakeup(&go);
+
+ /*
+ * Wait for all writers to finish. Every done flag must be observed
+ * before this frame (writers[], go) can go away.
+ */
+ for (i = 0; i < LF_NUM_WRITERS; i++) {
+ while (atomic_load_acq_32(
+ (volatile uint32_t *)&writers[i].done) == 0)
+ tsleep(&writers[i].done, 0, "lf_wait", hz / 10);
+ KTEST_EQUAL(writers[i].events_written, LF_EVENTS_PER_WRITER);
+ }
+
+ /* Read until the subscriber returns no more data; sum the bytes */
+ do {
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, 0);
+ total_read += read_bytes;
+ } while (read_bytes > 0);
+
+ eventlog_subscriber_get_stats(subscriber, &stats);
+ KTEST_LOG(ctx, "Total bytes read: %zu, dropped events: %llu",
+ total_read, (unsigned long long)stats.dropped_events);
+ KTEST_VERIFY(total_read > 0);
+
+ /*
+ * With a 256KB buffer, some events may be dropped on small CPUs where
+ * all threads hit the same per-CPU buffer. That's fine - the test
+ * validates no crashes and no corruption (INVARIANTS checks).
+ * total_attempted is only logged next to the drop count below; the
+ * equality written + dropped == total attempted is not asserted
+ * here, since the bytes read are not parsed into an event count.
+ */
+ /* +1 for SESSION_CREATE */
+ uint64_t total_attempted =
+ (uint64_t)LF_NUM_WRITERS * LF_EVENTS_PER_WRITER + 1;
+ KTEST_LOG(ctx, "Total attempted: %llu, dropped: %llu",
+ (unsigned long long)total_attempted,
+ (unsigned long long)stats.dropped_events);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ free(read_buf, M_EVENTLOG_TEST);
+
+ return (0);
+#undef LF_NUM_WRITERS
+#undef LF_EVENTS_PER_WRITER
+}
+
+/*
+ * Stress the writer/swap contention path: many writers + a reader doing
+ * rapid swaps. This exercises the commit CAS retry path when a reader
+ * swap races with a writer's commit.
+ */
+/* Per-thread state for lockfree_swap_writer() (run-until-stopped writers) */
+struct lockfree_swap_writer_data {
+ /* Session every event is written to */
+ struct eventlog_session *session;
+ /* Shared stop flag; the writer loops until it reads non-zero */
+ int *stop;
+ /* Bumped atomically after each eventlog_event_write() call */
+ uint64_t events_written;
+ /* Set to 1 by the thread just before it exits */
+ int exited;
+};
+
+/*
+ * Writer kthread body for the timed tests. Writes one 4-byte event per
+ * iteration (payload is a running counter, event id cycles through
+ * 0x1000..0x10FF), yielding the CPU after each write, until the shared
+ * stop flag becomes non-zero. Publishes exited before leaving.
+ */
+static void
+lockfree_swap_writer(void *arg)
+{
+	struct lockfree_swap_writer_data *w = arg;
+	uint32_t seq;
+
+	for (seq = 0;
+	    atomic_load_acq_32((volatile uint32_t *)w->stop) == 0;
+	    seq++) {
+		eventlog_event_write(w->session, 0x1000 + (seq & 0xFF),
+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+		    &seq, sizeof(seq));
+		atomic_add_64(&w->events_written, 1);
+		/* Give other writers and the reader a chance to run. */
+		kern_yield(PRI_UNCHANGED);
+	}
+
+	atomic_store_rel_32((volatile uint32_t *)&w->exited, 1);
+	wakeup(&w->exited);
+	kthread_exit();
+}
+
+/*
+ * Callout handler: sets the stop flag passed as arg and wakes any
+ * thread sleeping on that flag's address.
+ */
+static void
+lockfree_stop_callout(void *arg)
+{
+	volatile uint32_t *flag = (volatile uint32_t *)arg;
+
+	atomic_store_rel_32(flag, 1);
+	wakeup(arg);
+}
+
+/*
+ * Runs LFSW_NUM_WRITERS lockfree_swap_writer() threads for
+ * LFSW_RUN_SECONDS while this thread repeatedly reads from the device
+ * subscriber, then joins the writers, drains what is left and checks
+ * that something was written, something was read, and at least one read
+ * during the run returned data.
+ */
+KTEST_FUNC(lockfree_writer_swap_contention)
+{
+#define LFSW_NUM_WRITERS 4
+#define LFSW_RUN_SECONDS 3
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ /* Per-thread state and the stop flag live on this stack frame */
+ struct lockfree_swap_writer_data writers[LFSW_NUM_WRITERS];
+ struct thread *threads[LFSW_NUM_WRITERS];
+ int stop = 0;
+ struct callout stop_timer;
+ char *read_buf;
+ size_t read_buf_size = 64 * 1024;
+ size_t total_bytes_read = 0;
+ size_t read_bytes;
+ /* Counts reads that returned data while the writers were running */
+ uint64_t swap_iterations = 0;
+ struct eventlog_stats stats;
+ int i, error;
+
+ KTEST_LOG(ctx,
+ "Testing lock-free writer/swap contention (%d writers, %d seconds)",
+ LFSW_NUM_WRITERS, LFSW_RUN_SECONDS);
+
+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+ KTEST_NEQUAL(read_buf, NULL);
+
+ provider = test_create_provider("test_lf_swap", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* 128KB buffer to trigger frequent swaps (above 64KB minimum) */
+ subscriber = eventlog_subscriber_create_device(128 * 1024);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber, "test_lf_swap",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Start writer threads; they begin writing immediately */
+ for (i = 0; i < LFSW_NUM_WRITERS; i++) {
+ bzero(&writers[i], sizeof(writers[i]));
+ writers[i].session = session;
+ writers[i].stop = &stop;
+ error = kthread_add(lockfree_swap_writer, &writers[i], NULL,
+ &threads[i], 0, 0, "lfsw_writer_%d", i);
+ KTEST_EQUAL(error, 0);
+ }
+
+ /*
+ * Use a callout to set stop from softclock context. On a 2-CPU system,
+ * writers in tight loops can starve the main thread on the run queue,
+ * preventing it from ever executing stop=1. The callout fires from
+ * timer interrupt context, bypassing scheduler contention.
+ */
+ callout_init(&stop_timer, 1);
+ callout_reset(&stop_timer, hz * LFSW_RUN_SECONDS,
+ lockfree_stop_callout, &stop);
+
+ /* Reader loop: read rapidly to trigger swaps while writers active */
+ while (atomic_load_acq_32((volatile uint32_t *)&stop) == 0) {
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, 0);
+ if (read_bytes > 0) {
+ total_bytes_read += read_bytes;
+ swap_iterations++;
+ }
+ /* Sleep one tick; the callout's wakeup(&stop) ends this early */
+ tsleep(&stop, 0, "lfsw_rd", 1);
+ }
+
+ /* The callout references stop on this stack; make sure it is done */
+ callout_drain(&stop_timer);
+
+ /* Wait for writers to exit */
+ for (i = 0; i < LFSW_NUM_WRITERS; i++) {
+ while (atomic_load_acq_32(
+ (volatile uint32_t *)&writers[i].exited) == 0)
+ tsleep(&writers[i].exited, 0, "lfsw_wait", hz / 10);
+ }
+
+ /* Drain remaining */
+ do {
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, 0);
+ total_bytes_read += read_bytes;
+ } while (read_bytes > 0);
+
+ eventlog_subscriber_get_stats(subscriber, &stats);
+
+ /* Safe to read plainly: every writer's exited flag was acquired above */
+ uint64_t total_written = 0;
+ for (i = 0; i < LFSW_NUM_WRITERS; i++)
+ total_written += writers[i].events_written;
+
+ KTEST_LOG(ctx, "Writers produced %llu events, reader did %llu swaps, "
+ "read %zu bytes, dropped %llu",
+ (unsigned long long)total_written,
+ (unsigned long long)swap_iterations,
+ total_bytes_read,
+ (unsigned long long)stats.dropped_events);
+
+ KTEST_VERIFY(total_written > 0);
+ KTEST_VERIFY(total_bytes_read > 0);
+ KTEST_VERIFY(swap_iterations > 0);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ free(read_buf, M_EVENTLOG_TEST);
+
+ return (0);
+#undef LFSW_NUM_WRITERS
+#undef LFSW_RUN_SECONDS
+}
+
+/*
+ * Test buffer-full contention: a 128KB device subscriber buffer, several
+ * writers and no reader draining while they run, to force the
+ * buffer-full swap/drop path under contention.
+ *
+ * The only assertion on the outcome is that at least one event was
+ * dropped. The bytes read back afterwards are counted and logged but
+ * not parsed here; event well-formedness is left to the INVARIANTS
+ * checks in the read path (see the comment near the end).
+ */
+KTEST_FUNC(lockfree_buffer_full_contention)
+{
+#define LFBF_NUM_WRITERS 4
+#define LFBF_EVENTS_PER_WRITER 5000
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ /* Per-thread state and the go flag live on this stack frame */
+ struct lockfree_writer_data writers[LFBF_NUM_WRITERS];
+ struct thread *threads[LFBF_NUM_WRITERS];
+ int go = 0;
+ struct eventlog_stats stats;
+ char *read_buf;
+ size_t read_buf_size = 64 * 1024;
+ size_t total_read = 0;
+ size_t read_bytes;
+ int i, error;
+
+ KTEST_LOG(ctx, "Testing lock-free buffer full contention (%d writers, "
+ "%d events each, 128KB buffer)",
+ LFBF_NUM_WRITERS, LFBF_EVENTS_PER_WRITER);
+
+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+ KTEST_NEQUAL(read_buf, NULL);
+
+ provider = test_create_provider("test_lf_bfull", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* 128KB buffer - will overflow quickly with concurrent writers */
+ subscriber = eventlog_subscriber_create_device(128 * 1024);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber,
+ "test_lf_bfull", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Drain SESSION_CREATE (result intentionally not checked) */
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, 0);
+
+ /* Create all writer threads; they sleep until go is set */
+ for (i = 0; i < LFBF_NUM_WRITERS; i++) {
+ bzero(&writers[i], sizeof(writers[i]));
+ writers[i].session = session;
+ writers[i].go = &go;
+ writers[i].thread_idx = i;
+ writers[i].num_events = LFBF_EVENTS_PER_WRITER;
+ error = kthread_add(lockfree_writer_thread, &writers[i], NULL,
+ &threads[i], 0, 0, "lfbf_writer_%d", i);
+ KTEST_EQUAL(error, 0);
+ }
+
+ /* Release all writers */
+ atomic_store_rel_32((volatile uint32_t *)&go, 1);
+ wakeup(&go);
+
+ /* Wait for completion (nothing reads from the subscriber meanwhile) */
+ for (i = 0; i < LFBF_NUM_WRITERS; i++) {
+ while (atomic_load_acq_32(
+ (volatile uint32_t *)&writers[i].done) == 0)
+ tsleep(&writers[i].done, 0, "lfbf_wait", hz / 10);
+ }
+
+ /* Read whatever survived */
+ do {
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, 0);
+ total_read += read_bytes;
+ } while (read_bytes > 0);
+
+ eventlog_subscriber_get_stats(subscriber, &stats);
+
+ /* SESSION_CREATE was drained above, so it is not counted here */
+ uint64_t total_attempted =
+ (uint64_t)LFBF_NUM_WRITERS * LFBF_EVENTS_PER_WRITER;
+ KTEST_LOG(ctx, "Attempted %llu events, dropped %llu, read %zu bytes",
+ (unsigned long long)total_attempted,
+ (unsigned long long)stats.dropped_events,
+ total_read);
+
+ /*
+ * With a 128KB buffer and no reader draining during writes, almost all
+ * events should be dropped. The key assertion is that we didn't crash
+ * and INVARIANTS didn't fire.
+ */
+ KTEST_VERIFY(stats.dropped_events > 0);
+
+ /*
+ * Validate that events that were read are well-formed by reading with
+ * INVARIANTS buffer validation (already baked into eventlog_read).
+ */
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ free(read_buf, M_EVENTLOG_TEST);
+
+ return (0);
+#undef LFBF_NUM_WRITERS
+#undef LFBF_EVENTS_PER_WRITER
+}
+
+/*
+ * Test data integrity under concurrent lock-free writes: many writers +
+ * concurrent reader, verify every event read back has valid structure
+ * (an event_length that is at least one header and stays inside the
+ * bytes returned by the read). Payload contents are not inspected.
+ * This catches torn writes or commit ordering bugs that corrupt the
+ * event framing.
+ */
+struct lockfree_integrity_reader_data {
+ /* Device subscriber the reader thread reads from */
+ struct eventlog_subscriber *subscriber;
+ /* Shared stop flag; the reader loops until it reads non-zero */
+ int *stop;
+ /* Events whose header passed the event_length checks */
+ uint64_t events_validated;
+ /* Total bytes returned by eventlog_read_into_buf() */
+ uint64_t bytes_read;
+ /* Reads abandoned because an event_length was out of range */
+ uint64_t corrupt_events;
+ /* Set to 1 by the reader thread just before it exits */
+ int exited;
+};
+
+/*
+ * Reader kthread body for the integrity test. Until the shared stop
+ * flag is set, reads from the subscriber into a private 64KB buffer and
+ * walks the returned bytes event by event, counting headers whose
+ * event_length is plausible and flagging the first implausible one in
+ * each read as corrupt. Publishes exited before leaving.
+ */
+static void
+lockfree_integrity_reader(void *arg)
+{
+	struct lockfree_integrity_reader_data *rd = arg;
+	struct eventlog_event_header hdr;
+	size_t bufsz = 64 * 1024;
+	size_t got, pos;
+	char *buf;
+
+	buf = malloc(bufsz, M_EVENTLOG_TEST, M_WAITOK);
+
+	for (;;) {
+		if (atomic_load_acq_32((volatile uint32_t *)rd->stop) != 0)
+			break;
+
+		got = eventlog_read_into_buf(rd->subscriber, buf, bufsz, 0);
+		if (got == 0) {
+			/* Nothing buffered yet; let the writers run. */
+			kern_yield(PRI_UNCHANGED);
+			continue;
+		}
+		rd->bytes_read += got;
+
+		/*
+		 * Step through the buffer one event at a time. The header
+		 * is copied out so it can be read regardless of where it
+		 * sits in buf.
+		 */
+		for (pos = 0; pos + sizeof(hdr) <= got;
+		    pos += hdr.event_length) {
+			memcpy(&hdr, buf + pos, sizeof(hdr));
+
+			/* Too short to hold a header, or past the data. */
+			if (hdr.event_length < sizeof(hdr) ||
+			    pos + hdr.event_length > got) {
+				rd->corrupt_events++;
+				break;
+			}
+			rd->events_validated++;
+		}
+	}
+
+	free(buf, M_EVENTLOG_TEST);
+	atomic_store_rel_32((volatile uint32_t *)&rd->exited, 1);
+	wakeup(&rd->exited);
+	kthread_exit();
+}
+
+/*
+ * Runs one lockfree_integrity_reader() thread and LFDI_NUM_WRITERS
+ * lockfree_swap_writer() threads against the same device subscriber for
+ * LFDI_RUN_SECONDS, then joins them and checks that events were written,
+ * that the reader validated at least one event, and that it flagged no
+ * corrupt events.
+ */
+KTEST_FUNC(lockfree_data_integrity_under_contention)
+{
+#define LFDI_NUM_WRITERS 4
+#define LFDI_RUN_SECONDS 3
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ /* Thread state and the stop flag live on this stack frame */
+ struct lockfree_swap_writer_data writers[LFDI_NUM_WRITERS];
+ struct lockfree_integrity_reader_data reader_data;
+ struct thread *writer_threads[LFDI_NUM_WRITERS];
+ struct thread *reader_thread;
+ int stop = 0;
+ struct callout stop_timer;
+ struct eventlog_stats stats;
+ int i, error;
+
+ KTEST_LOG(ctx,
+ "Testing lock-free data integrity (%d writers + reader, "
+ "%d seconds)",
+ LFDI_NUM_WRITERS, LFDI_RUN_SECONDS);
+
+ provider = test_create_provider("test_lf_integ", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* 128KB buffer: holds some events but small enough to swap often */
+ subscriber = eventlog_subscriber_create_device(128 * 1024);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber,
+ "test_lf_integ", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /* Start reader (before the writers, so it is polling from the start) */
+ bzero(&reader_data, sizeof(reader_data));
+ reader_data.subscriber = subscriber;
+ reader_data.stop = &stop;
+ error = kthread_add(lockfree_integrity_reader, &reader_data, NULL,
+ &reader_thread, 0, 0, "lfdi_reader");
+ KTEST_EQUAL(error, 0);
+
+ /* Start writers */
+ for (i = 0; i < LFDI_NUM_WRITERS; i++) {
+ bzero(&writers[i], sizeof(writers[i]));
+ writers[i].session = session;
+ writers[i].stop = &stop;
+ error = kthread_add(lockfree_swap_writer, &writers[i], NULL,
+ &writer_threads[i], 0, 0, "lfdi_writer_%d", i);
+ KTEST_EQUAL(error, 0);
+ }
+
+ /*
+ * Use a callout to set stop from softclock context. On a 2-CPU system,
+ * writers in tight loops can starve the main thread on the run queue,
+ * preventing it from ever executing stop=1. The callout fires from
+ * timer interrupt context, bypassing scheduler contention.
+ */
+ callout_init(&stop_timer, 1);
+ callout_reset(&stop_timer, hz * LFDI_RUN_SECONDS,
+ lockfree_stop_callout, &stop);
+
+ /* Sleep until the callout sets stop; it also does wakeup(&stop) */
+ while (atomic_load_acq_32((volatile uint32_t *)&stop) == 0)
+ tsleep(&stop, 0, "lfdi_run", hz);
+
+ callout_drain(&stop_timer);
+ /*
+ * Wake reader if sleeping. NOTE(review): the reader thread in this
+ * file polls with kern_yield() and does not sleep on this channel
+ * itself; whether eventlog_read_into_buf() can sleep on it is not
+ * visible here -- confirm this wakeup is still needed.
+ */
+ wakeup(subscriber);
+
+ /* Join writers first, then the reader */
+ for (i = 0; i < LFDI_NUM_WRITERS; i++) {
+ while (atomic_load_acq_32(
+ (volatile uint32_t *)&writers[i].exited) == 0)
+ tsleep(&writers[i].exited, 0, "lfdi_ww", hz / 10);
+ }
+ while (atomic_load_acq_32(
+ (volatile uint32_t *)&reader_data.exited) == 0)
+ tsleep(&reader_data.exited, 0, "lfdi_rw", hz / 10);
+
+ eventlog_subscriber_get_stats(subscriber, &stats);
+
+ /* Safe to read plainly: every thread's exited flag was acquired above */
+ uint64_t total_written = 0;
+ for (i = 0; i < LFDI_NUM_WRITERS; i++)
+ total_written += writers[i].events_written;
+
+ KTEST_LOG(ctx, "Writers: %llu events. Reader: validated %llu events, "
+ "%llu bytes, %llu corrupt. Dropped: %llu",
+ (unsigned long long)total_written,
+ (unsigned long long)reader_data.events_validated,
+ (unsigned long long)reader_data.bytes_read,
+ (unsigned long long)reader_data.corrupt_events,
+ (unsigned long long)stats.dropped_events);
+
+ KTEST_VERIFY(total_written > 0);
+ KTEST_VERIFY(reader_data.events_validated > 0);
+ KTEST_EQUAL(reader_data.corrupt_events, 0);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+#undef LFDI_NUM_WRITERS
+#undef LFDI_RUN_SECONDS
+}
+
+/*
+ * Test reader-writer swap race: the test thread acts as the reader and
+ * polls the subscriber with non-blocking reads in a tight loop while
+ * LFRW_NUM_WRITERS kernel threads run lockfree_swap_writer against one
+ * shared session. With a 128KB subscriber buffer the reader is expected
+ * to swap buffers frequently, maximizing the chance that the reader's swap
+ * CAS races with a writer's commit CAS. The writer must detect the swap
+ * (active buffer changed) and redo the write to the correct buffer.
+ *
+ * What is asserted: at least one event was written, at least one read
+ * returned data, and a non-zero number of bytes was read. The bytes read
+ * are only counted, not parsed; getting through the run without tripping a
+ * kernel assertion (MPASS) is the remaining signal.
+ */
+KTEST_FUNC(lockfree_reader_writer_swap_race)
+{
+#define LFRW_NUM_WRITERS 4
+#define LFRW_RUN_SECONDS 3
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct lockfree_swap_writer_data writers[LFRW_NUM_WRITERS];
+ struct thread *threads[LFRW_NUM_WRITERS];
+ int stop = 0; /* shared stop flag: polled here, handed to writers/callout */
+ struct callout stop_timer;
+ char *read_buf;
+ size_t read_buf_size = 4096;
+ size_t total_bytes_read = 0;
+ size_t read_bytes;
+ uint64_t read_iterations = 0; /* counts only reads that returned data */
+ struct eventlog_stats stats;
+ int i, error;
+
+ KTEST_LOG(ctx,
+ "Testing reader-writer swap race (%d writers, %d seconds, "
+ "128KB buffer)", LFRW_NUM_WRITERS, LFRW_RUN_SECONDS);
+
+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+ KTEST_NEQUAL(read_buf, NULL);
+
+ provider = test_create_provider("test_lf_race", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* 128KB buffer: forces frequent swaps, maximizing race window */
+ subscriber = eventlog_subscriber_create_device(128 * 1024);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber, "test_lf_race",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ session = eventlog_session_create(provider, 0, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ /*
+ * Start the writers. Every writer gets a zeroed control block that
+ * points at the shared session and the shared stop flag.
+ */
+ for (i = 0; i < LFRW_NUM_WRITERS; i++) {
+ bzero(&writers[i], sizeof(writers[i]));
+ writers[i].session = session;
+ writers[i].stop = &stop;
+ error = kthread_add(lockfree_swap_writer, &writers[i], NULL,
+ &threads[i], 0, 0, "lfrw_writer_%d", i);
+ KTEST_EQUAL(error, 0);
+ }
+
+ KTEST_LOG(ctx, "checkpoint: writers started, arming stop callout");
+
+ /*
+ * One-shot callout: lockfree_stop_callout is invoked with &stop after
+ * LFRW_RUN_SECONDS (presumably it sets *stop; its body is not in this
+ * part of the file).
+ */
+ callout_init(&stop_timer, 1);
+ callout_reset(&stop_timer, hz * LFRW_RUN_SECONDS,
+ lockfree_stop_callout, &stop);
+
+ KTEST_LOG(ctx, "checkpoint: entering reader loop");
+
+ /*
+ * Reader loop: read as fast as possible (no tsleep) to maximize
+ * the chance of swapping while a writer is mid-commit.
+ */
+ while (atomic_load_acq_32((volatile uint32_t *)&stop) == 0) {
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, FNONBLOCK);
+ if (read_bytes > 0) {
+ total_bytes_read += read_bytes;
+ read_iterations++;
+ }
+ }
+
+ KTEST_LOG(ctx,
+ "checkpoint: reader loop exited (iters=%llu, bytes=%zu); "
+ "draining callout",
+ (unsigned long long)read_iterations, total_bytes_read);
+
+ /* stop and stop_timer live on this stack frame; finish the callout. */
+ callout_drain(&stop_timer);
+
+ KTEST_LOG(ctx, "checkpoint: callout drained, waiting for writers");
+
+ /* Poll each writer's exited flag, sleeping hz / 10 ticks between checks. */
+ for (i = 0; i < LFRW_NUM_WRITERS; i++) {
+ while (atomic_load_acq_32(
+ (volatile uint32_t *)&writers[i].exited) == 0)
+ tsleep(&writers[i].exited, 0, "lfrw_ww", hz / 10);
+ KTEST_LOG(ctx, "checkpoint: writer %d exited", i);
+ }
+
+ KTEST_LOG(ctx, "checkpoint: all writers exited, draining buffers");
+
+ /*
+ * Drain remaining.
+ * NOTE(review): this loop passes flags 0 while the polling loop above
+ * passes FNONBLOCK; confirm that eventlog_read_into_buf returns 0
+ * rather than sleeping once nothing is left to read.
+ */
+ do {
+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+ read_buf_size, 0);
+ total_bytes_read += read_bytes;
+ } while (read_bytes > 0);
+
+ KTEST_LOG(ctx, "checkpoint: drain complete, gathering stats");
+
+ eventlog_subscriber_get_stats(subscriber, &stats);
+
+ /* Sum the per-writer counters; all writers have exited by now. */
+ uint64_t total_written = 0;
+ for (i = 0; i < LFRW_NUM_WRITERS; i++)
+ total_written += writers[i].events_written;
+
+ KTEST_LOG(ctx, "Writers: %llu events. Reader: %llu reads, %zu bytes. "
+ "Dropped: %llu",
+ (unsigned long long)total_written,
+ (unsigned long long)read_iterations,
+ total_bytes_read,
+ (unsigned long long)stats.dropped_events);
+
+ KTEST_VERIFY(total_written > 0);
+ KTEST_VERIFY(total_bytes_read > 0);
+ KTEST_VERIFY(read_iterations > 0);
+
+ KTEST_LOG(ctx, "checkpoint: tearing down session/subscriber/provider");
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(provider);
+ free(read_buf, M_EVENTLOG_TEST);
+
+ KTEST_LOG(ctx, "checkpoint: teardown complete");
+
+ return (0);
+#undef LFRW_NUM_WRITERS
+#undef LFRW_RUN_SECONDS
+}
+
+/*
+ * Test: timestamp epoch boundary defers future-timestamped events.
+ * Writes five events with small explicit timestamps (1..5) and one event
+ * with a timestamp close to UINT64_MAX.  Verifies that the next read
+ * returns exactly the five early events and that a follow-up non-blocking
+ * read returns nothing, i.e. the far-future event is held back.
+ *
+ * The 8KB read buffer is allocated with malloc(9) instead of living on the
+ * kernel stack, matching lockfree_reader_writer_swap_race.  It is released
+ * together with the other test resources at the end of the function.
+ */
+KTEST_FUNC(timestamp_epoch_boundary)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	struct eventlog_event_header hdr;
+	char *read_buf;
+	size_t read_buf_size = 8 * 1024;
+	size_t offset, read_bytes;
+	uint32_t payload;
+	int i, event_count;
+
+	KTEST_LOG(ctx, "Testing timestamp epoch boundary deferral");
+
+	/* M_WAITOK allocations do not fail, so no NULL check is needed. */
+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+
+	provider = test_create_provider("test_ts_epoch", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+	subscriber = test_enable_provider_device("test_ts_epoch",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
+	KTEST_NEQUAL(subscriber, NULL);
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* Drain SESSION_CREATE */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+
+	/* Write 5 events with timestamps well in the past (values 1..5) */
+	for (i = 0; i < 5; i++) {
+		payload = (uint32_t)(i + 1);
+		eventlog_event_write_at(session, 100 + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &payload, sizeof(payload),
+		    (uint64_t)(i + 1));
+	}
+
+	/* Write 1 event with a far-future timestamp */
+	payload = 0xFFFF;
+	eventlog_event_write_at(session, 200, EVENTLOG_LEVEL_INFO,
+	    0xFFFFFFFF, &payload, sizeof(payload),
+	    UINT64_MAX - 1000);
+
+	/* Read: should get exactly the 5 past events */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+
+	/*
+	 * Walk the returned bytes header by header.  The walk stops at the
+	 * first header whose event_length is shorter than a header or would
+	 * run past the bytes actually read.  Every event seen must carry a
+	 * timestamp below the far-future one written above.
+	 */
+	event_count = 0;
+	offset = 0;
+	while (offset + sizeof(struct eventlog_event_header) <= read_bytes) {
+		/* Copy out: the header may not be aligned inside read_buf. */
+		memcpy(&hdr, read_buf + offset, sizeof(hdr));
+		if (hdr.event_length < sizeof(struct eventlog_event_header) ||
+		    offset + hdr.event_length > read_bytes)
+			break;
+		KTEST_VERIFY(hdr.timestamp < UINT64_MAX - 1000);
+		event_count++;
+		offset += hdr.event_length;
+	}
+	KTEST_EQUAL(event_count, 5);
+
+	/* Second read (non-blocking): future deferred, nothing readable */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, FNONBLOCK);
+	KTEST_EQUAL(read_bytes, 0);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+	free(read_buf, M_EVENTLOG_TEST);
+
+	return (0);
+}
+
+/*
+ * Test: normal events (real timestamps) are unaffected by epoch boundary.
+ * Writes ten events through eventlog_event_write(), which takes no explicit
+ * timestamp, and verifies that all ten come back from a single read and
+ * that a follow-up non-blocking read returns nothing.
+ *
+ * The 8KB read buffer is allocated with malloc(9) instead of living on the
+ * kernel stack, matching lockfree_reader_writer_swap_race.  It is released
+ * together with the other test resources at the end of the function.
+ */
+KTEST_FUNC(timestamp_epoch_normal_delivery)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	struct eventlog_event_header hdr;
+	char *read_buf;
+	size_t read_buf_size = 8 * 1024;
+	size_t offset, read_bytes;
+	uint32_t payload;
+	int i, event_count;
+
+	KTEST_LOG(ctx, "Testing that normal events pass epoch boundary");
+
+	/* M_WAITOK allocations do not fail, so no NULL check is needed. */
+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+
+	provider = test_create_provider("test_ts_normal", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+	subscriber = test_enable_provider_device("test_ts_normal",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
+	KTEST_NEQUAL(subscriber, NULL);
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/* Drain SESSION_CREATE */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+
+	/* Write 10 events with real timestamps */
+	for (i = 0; i < 10; i++) {
+		payload = (uint32_t)(i + 1);
+		eventlog_event_write(session, 100 + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &payload, sizeof(payload));
+	}
+
+	/* Read: should get all 10 events */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, 0);
+	KTEST_VERIFY(read_bytes > 0);
+
+	/*
+	 * Count events by walking the headers.  The walk stops at the first
+	 * header whose event_length is shorter than a header or would run
+	 * past the bytes actually read.
+	 */
+	event_count = 0;
+	offset = 0;
+	while (offset + sizeof(struct eventlog_event_header) <= read_bytes) {
+		/* Copy out: the header may not be aligned inside read_buf. */
+		memcpy(&hdr, read_buf + offset, sizeof(hdr));
+		if (hdr.event_length < sizeof(struct eventlog_event_header) ||
+		    offset + hdr.event_length > read_bytes)
+			break;
+		event_count++;
+		offset += hdr.event_length;
+	}
+	KTEST_EQUAL(event_count, 10);
+
+	/* Buffer should be empty now */
+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, FNONBLOCK);
+	KTEST_EQUAL(read_bytes, 0);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+	free(read_buf, M_EVENTLOG_TEST);
+
+	return (0);
+}
+
+/*
+ * Test: small uio buffer with epoch boundary requires multiple reads.
+ * Writes six events with small explicit timestamps followed by two with
+ * timestamps close to UINT64_MAX, then reads with a buffer sized for
+ * roughly two events at a time.  Verifies that exactly the six early
+ * events are delivered across the reads and that no delivered event
+ * carries a far-future timestamp.
+ *
+ * The 4KB buffer used to drain SESSION_CREATE is heap-allocated rather than
+ * placed on the kernel stack; it is freed before the result is asserted.
+ */
+KTEST_FUNC(timestamp_epoch_small_uio)
+{
+	struct eventlog_provider *provider;
+	struct eventlog_session *session;
+	struct eventlog_subscriber *subscriber;
+	char *drain_buf;
+	size_t drain_buf_size = 4096;
+	size_t read_bytes;
+	uint32_t payload;
+	int i, total_events;
+
+	KTEST_LOG(ctx, "Testing epoch boundary with small uio buffer");
+
+	provider = test_create_provider("test_ts_small_uio", NULL, NULL);
+	KTEST_NEQUAL(provider, NULL);
+	subscriber = test_enable_provider_device("test_ts_small_uio",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
+	KTEST_NEQUAL(subscriber, NULL);
+	session = eventlog_session_create(provider, 0, true, NULL, 0);
+	KTEST_NEQUAL(session, NULL);
+
+	/*
+	 * Drain SESSION_CREATE with a large buffer.  The contents are not
+	 * inspected, so release the buffer before checking the byte count.
+	 */
+	drain_buf = malloc(drain_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+	read_bytes = eventlog_read_into_buf(subscriber, drain_buf,
+	    drain_buf_size, 0);
+	free(drain_buf, M_EVENTLOG_TEST);
+	KTEST_VERIFY(read_bytes > 0);
+
+	/* Write 6 events with past timestamps, then 2 with future */
+	for (i = 0; i < 6; i++) {
+		payload = (uint32_t)(i + 1);
+		eventlog_event_write_at(session, 100 + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &payload, sizeof(payload),
+		    (uint64_t)(1000 + i));
+	}
+	for (i = 0; i < 2; i++) {
+		payload = (uint32_t)(100 + i);
+		eventlog_event_write_at(session, 200 + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &payload, sizeof(payload),
+		    UINT64_MAX - (uint64_t)(2000 - i));
+	}
+
+	/* Read with a buffer that fits ~2 events at a time */
+	total_events = 0;
+	{
+		char small_buf[2 * (sizeof(struct eventlog_event_header) +
+		    sizeof(uint32_t)) + 64];
+
+		/* At most 10 reads; stop at the first one returning 0. */
+		for (i = 0; i < 10; i++) {
+			size_t offset;
+
+			read_bytes = eventlog_read_into_buf(subscriber,
+			    small_buf, sizeof(small_buf), FNONBLOCK);
+			if (read_bytes == 0)
+				break;
+			offset = 0;
+			while (offset + sizeof(struct eventlog_event_header) <=
+			    read_bytes) {
+				struct eventlog_event_header hdr;
+
+				memcpy(&hdr, small_buf + offset, sizeof(hdr));
+				if (hdr.event_length <
+				    sizeof(struct eventlog_event_header) ||
+				    offset + hdr.event_length > read_bytes)
+					break;
+				/*
+				 * The future events were written with
+				 * timestamps above this bound.
+				 */
+				KTEST_VERIFY(
+				    hdr.timestamp < UINT64_MAX - 10000);
+				total_events++;
+				offset += hdr.event_length;
+			}
+		}
+	}
+
+	/* Should have read exactly the 6 past events */
+	KTEST_EQUAL(total_events, 6);
+
+	eventlog_session_destroy(session);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(provider);
+
+	return (0);
+}
+
+/* Dump state test infrastructure */
+static volatile uint32_t dump_callback_invocations;
+static struct eventlog_session *dump_test_sessions[4];
+static int dump_test_session_count;
+
+/*
+ * Provider dump callback shared by the dump_state tests.
+ *
+ * Bumps dump_callback_invocations, then writes one INFO-level event
+ * (id 0x100 + i, payload 0xdead0000 | i) on each of the first
+ * dump_test_session_count entries of dump_test_sessions that is non-NULL
+ * and whose effective_level is at least EVENTLOG_LEVEL_INFO.  Neither
+ * argument is used.
+ */
+static void
+test_dump_callback(struct eventlog_provider *provider __unused,
+    void *arg __unused)
+{
+	int i;
+
+	/* The counter is a uint32_t and is read back with atomic_load_acq_32. */
+	atomic_add_32(&dump_callback_invocations, 1);
+	for (i = 0; i < dump_test_session_count; i++) {
+		if (dump_test_sessions[i] != NULL &&
+		    dump_test_sessions[i]->effective_level >=
+		    EVENTLOG_LEVEL_INFO) {
+			uint32_t data = 0xdead0000 | i;
+
+			eventlog_event_write(dump_test_sessions[i], 0x100 + i,
+			    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+			    &data, sizeof(data));
+		}
+	}
+}
+
+/*
+ * Verify dump callback is invoked and events arrive at subscriber.
+ *
+ * Registers a provider with test_dump_callback, creates one session,
+ * subscribes a callback subscriber with EVENTLOG_SUBSCRIPTION_DUMP_STATE,
+ * waits for the dump to finish, and then checks that the dump callback ran
+ * exactly once and that the subscriber counted at least one event.
+ */
+KTEST_FUNC(dump_state_basic)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *callback_data;
+
+ KTEST_LOG(ctx, "Testing dump state basic functionality");
+
+ /* Reset the shared dump scratch; one session slot is in use. */
+ dump_callback_invocations = 0;
+ dump_test_session_count = 1;
+
+ provider = test_create_provider("test_ds_basic", test_dump_callback,
+ NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+ dump_test_sessions[0] = session;
+
+ callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data->lock, "test_ds_basic", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_basic", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ /*
+ * dump_state runs on a private taskqueue; drain before reading
+ * the observation counters so we deterministically see the
+ * post-dump state and not the in-flight state.
+ */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ KTEST_EQUAL(atomic_load_acq_32(&dump_callback_invocations), 1);
+ /*
+ * At least the event written by test_dump_callback must have been
+ * counted. Other events (e.g. SESSION_CREATE) may or may not be
+ * included, hence >= rather than an exact count.
+ */
+ KTEST_VERIFY(atomic_load_acq_32(&callback_data->event_count) >= 1);
+
+ /* Unpublish the session before destroying it. */
+ dump_test_sessions[0] = NULL;
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Verify dump events go only to the requesting subscriber, not others.
+ *
+ * Two callback subscribers subscribe to the same provider one after the
+ * other, each with EVENTLOG_SUBSCRIPTION_DUMP_STATE. The event count of the
+ * first subscriber is sampled after its own dump has drained and must be
+ * unchanged after the second subscriber's dump has drained.
+ */
+KTEST_FUNC(dump_state_routing)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *sub1, *sub2;
+ struct test_callback_data *cd1, *cd2;
+ uint32_t sub1_count_before;
+
+ KTEST_LOG(ctx, "Testing dump state routing to single subscriber");
+
+ /* Reset the shared dump scratch; one session slot is in use. */
+ dump_callback_invocations = 0;
+ dump_test_session_count = 1;
+
+ provider = test_create_provider("test_ds_route", test_dump_callback,
+ NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+ dump_test_sessions[0] = session;
+
+ /*
+ * sub1: subscribes first. Its own dump is triggered by the
+ * subscription; drain so the count we capture next is stable.
+ */
+ cd1 = malloc(sizeof(*cd1), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+ mtx_init(&cd1->lock, "test_ds_route1", NULL, MTX_DEF);
+ sub1 = eventlog_subscriber_create_callback(test_event_callback, cd1);
+ KTEST_NEQUAL(sub1, NULL);
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(sub1, "test_ds_route",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+ eventlog_subscriber_drain_dumps(sub1);
+
+ /* Baseline for sub1, taken before sub2 exists. */
+ sub1_count_before = atomic_load_acq_32(&cd1->event_count);
+
+ /*
+ * sub2 subscribes second. Its dump must be routed only to sub2 --
+ * sub1's count must not change.
+ */
+ cd2 = malloc(sizeof(*cd2), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+ mtx_init(&cd2->lock, "test_ds_route2", NULL, MTX_DEF);
+ sub2 = eventlog_subscriber_create_callback(test_event_callback, cd2);
+ KTEST_NEQUAL(sub2, NULL);
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(sub2, "test_ds_route",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+ eventlog_subscriber_drain_dumps(sub2);
+
+ /* sub2 should have received the dump event */
+ KTEST_VERIFY(atomic_load_acq_32(&cd2->event_count) >= 1);
+ /* sub1 should NOT have received any additional events from the dump */
+ KTEST_EQUAL(atomic_load_acq_32(&cd1->event_count), sub1_count_before);
+
+ /* Unpublish the session before destroying it. */
+ dump_test_sessions[0] = NULL;
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(sub1);
+ eventlog_subscriber_destroy(sub2);
+ mtx_destroy(&cd1->lock);
+ mtx_destroy(&cd2->lock);
+ free(cd1, M_EVENTLOG_TEST);
+ free(cd2, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Verify DUMP_STATE with NULL callback is a graceful no-op.
+ *
+ * The provider is created without a dump callback. Subscribing with
+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE must still succeed, drain_dumps must
+ * return, and the subscriber must have counted at most one event.
+ */
+KTEST_FUNC(dump_state_no_callback)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *callback_data;
+
+ KTEST_LOG(ctx, "Testing dump state with no callback (graceful no-op)");
+
+ /* NULL dump callback: this is the case under test. */
+ provider = test_create_provider("test_ds_nocb", NULL, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data->lock, "test_ds_nocb", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ /* Should succeed without crash even though no dump callback */
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_nocb", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ /*
+ * No dump task should have been enqueued because the provider has
+ * no dump_callback. drain_dumps still has to be a no-op in that
+ * case (dump_pending stays at 0); call it explicitly to pin that
+ * contract.
+ */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ /* No dump events; only SESSION_CREATE may be counted */
+ KTEST_VERIFY(atomic_load_acq_32(&callback_data->event_count) <= 1);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Captures curthread->td_vnet (as uintptr_t to keep this file free of any
+ * struct-vnet dependency) observed at dump_callback invocation time.
+ * dump_observed_set records that the callback ran at all, so a captured
+ * value of 0 can be told apart from "never invoked".
+ */
+static volatile uintptr_t dump_observed_td_vnet;
+static volatile bool dump_observed_set;
+
+/*
+ * Dump callback that records the calling thread's td_vnet and then flags
+ * that the observation was made. Both arguments are unused.
+ */
+static void
+test_dump_callback_capture_td_vnet(struct eventlog_provider *provider __unused,
+ void *arg __unused)
+{
+ dump_observed_td_vnet = (uintptr_t)curthread->td_vnet;
+ /* Set the flag last, after the value has been stored. */
+ dump_observed_set = true;
+}
+
+/*
+ * Regression test for NCD-9675.
+ *
+ * Pins down the framework contract that motivated the TCP fix: the eventlog
+ * machinery invokes provider->dump_callback without setting curvnet.
+ * Providers that touch per-vnet state must iterate vnets / set curvnet
+ * themselves. The dump runs on a kernel taskqueue thread whose
+ * td_vnet is NULL; the test subscribes, drains, and verifies the
+ * callback's observed context.
+ *
+ * If a future change makes the framework set curvnet around the dump
+ * callback, this test will fail and the change should be deliberate (and
+ * accompanied by removing the per-provider VNET_FOREACH wrappers).
+ */
+KTEST_FUNC(dump_state_curvnet_not_set)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *callback_data;
+ int ret;
+
+ KTEST_LOG(ctx, "Verifying dump_callback runs with curvnet unset");
+
+ /*
+ * Seed the observation with a non-zero sentinel so that the zero
+ * checked for below can only come from the callback having stored it.
+ */
+ dump_observed_td_vnet = (uintptr_t)0x1;
+ dump_observed_set = false;
+
+ provider = test_create_provider("test_ds_curvnet",
+ test_dump_callback_capture_td_vnet, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* No session is created: the callback under test writes no events. */
+ callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data->lock, "test_ds_curvnet", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ ret = eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_curvnet", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE);
+ KTEST_EQUAL(ret, 0);
+
+ /* Wait for the async dump task to finish before reading observed. */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ /* The callback must have run, and must have seen td_vnet == NULL. */
+ KTEST_VERIFY(dump_observed_set);
+ KTEST_VERIFY(dump_observed_td_vnet == 0);
+
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Verify disabled sessions are skipped during dump.
+ *
+ * Two sessions are published to test_dump_callback; the second one is
+ * disabled with eventlog_session_set_enabled(..., 0) before the subscriber
+ * is attached. test_dump_callback checks effective_level directly before
+ * writing (presumably the same condition the _ENABLED macros test).
+ */
+KTEST_FUNC(dump_state_disabled_sessions)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *enabled_session, *disabled_session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *callback_data;
+
+ KTEST_LOG(ctx, "Testing dump state skips disabled sessions");
+
+ /* Reset the shared dump scratch; two session slots are in use. */
+ dump_callback_invocations = 0;
+ dump_test_session_count = 2;
+
+ provider = test_create_provider("test_ds_dis", test_dump_callback,
+ NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ enabled_session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(enabled_session, NULL);
+ dump_test_sessions[0] = enabled_session;
+
+ disabled_session = eventlog_session_create(provider, 2, true, NULL, 0);
+ KTEST_NEQUAL(disabled_session, NULL);
+ eventlog_session_set_enabled(disabled_session, 0);
+ dump_test_sessions[1] = disabled_session;
+
+ callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&callback_data->lock, "test_ds_dis", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ callback_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_dis", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ /* Wait for the dump task before reading the counters. */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ KTEST_EQUAL(atomic_load_acq_32(&dump_callback_invocations), 1);
+ /*
+ * The dump callback walks both sessions but only writes to those
+ * whose effective_level is at least EVENTLOG_LEVEL_INFO; the disabled
+ * session is expected to fall below that (TODO confirm that
+ * eventlog_session_set_enabled(..., 0) lowers effective_level), so
+ * only the enabled session's dump event should arrive.
+ *
+ * NOTE(review): the check below only requires that at least one
+ * event arrived. It does not by itself prove that the disabled
+ * session was skipped; an exact expected count would be needed for
+ * that.
+ */
+ {
+ uint32_t ec = atomic_load_acq_32(&callback_data->event_count);
+ KTEST_LOG(ctx, "Received %u events (enabled+dump)", ec);
+ KTEST_VERIFY(ec >= 1);
+ }
+
+ /* Unpublish both sessions before destroying them. */
+ dump_test_sessions[0] = NULL;
+ dump_test_sessions[1] = NULL;
+ eventlog_session_destroy(enabled_session);
+ eventlog_session_destroy(disabled_session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&callback_data->lock);
+ free(callback_data, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Async dump_state contract: the callback does not run on the
+ * subscribing thread, subscribe returns before the dump finishes,
+ * drain_dumps() / destroy() are the sync points, and re-subscribing
+ * does not re-fire the dump. Shared scratch for the tests below.
+ *
+ * async_dump_thread is a volatile pointer (not a pointer to a volatile
+ * struct thread): the pointer value itself is what one thread stores and
+ * another thread reads.
+ */
+static struct thread * volatile async_dump_thread;	/* thread that ran the dump */
+static volatile bool async_dump_observed;	/* record callback ran */
+static struct mtx async_dump_mtx;		/* protects the release wait */
+static struct cv async_dump_cv;			/* signalled on release */
+static volatile bool async_dump_release;	/* lets the blocking callback go */
+static volatile uint32_t async_dump_runs;	/* number of callback entries */
+
+/*
+ * Dump callback that records which thread it ran on (async_dump_thread),
+ * flags that it ran (async_dump_observed) and bumps async_dump_runs.
+ * Both arguments are unused.
+ */
+static void
+async_dump_callback_record_thread(struct eventlog_provider *provider __unused,
+ void *arg __unused)
+{
+ async_dump_thread = curthread;
+ async_dump_observed = true;
+ atomic_add_32(&async_dump_runs, 1);
+}
+
+/*
+ * Slow dump_callback: blocks until the test releases it via
+ * async_dump_release. Used to put the dump task into a known
+ * "in-flight" state so the test can race destroy / drain against it.
+ *
+ * The caller must have initialized async_dump_mtx and async_dump_cv and
+ * cleared async_dump_release before this can run. Both arguments are
+ * unused.
+ */
+static void
+async_dump_callback_block(struct eventlog_provider *provider __unused,
+ void *arg __unused)
+{
+ mtx_lock(&async_dump_mtx);
+ /* Announce entry first so a test can poll for "callback is running". */
+ atomic_add_32(&async_dump_runs, 1);
+ /* Re-check the flag after every wakeup; it is set under the mutex. */
+ while (!async_dump_release)
+ cv_wait(&async_dump_cv, &async_dump_mtx);
+ mtx_unlock(&async_dump_mtx);
+}
+
+/*
+ * Verifies dump_callback runs on a thread different from the subscriber's
+ * own thread (i.e. the framework taskqueue thread).
+ *
+ * Uses async_dump_callback_record_thread to capture the executing thread
+ * and compares it against curthread after the dump has drained.
+ */
+KTEST_FUNC(dump_state_async_runs_off_caller_thread)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *cd;
+
+ KTEST_LOG(ctx, "Verifying dump_callback runs on a different thread");
+
+ /* Reset the shared async scratch left behind by earlier tests. */
+ async_dump_thread = NULL;
+ async_dump_observed = false;
+ atomic_store_rel_32(&async_dump_runs, 0);
+
+ provider = test_create_provider("test_ds_async_thr",
+ async_dump_callback_record_thread, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+ mtx_init(&cd->lock, "test_ds_async_thr", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ cd);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_async_thr", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ /* Wait for the dump task before inspecting what it recorded. */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ /* Ran once, on some thread, and that thread is not this one. */
+ KTEST_VERIFY(async_dump_observed);
+ KTEST_VERIFY(async_dump_thread != NULL);
+ KTEST_VERIFY(async_dump_thread != curthread);
+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&cd->lock);
+ free(cd, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Verifies subscribe returns before a slow dump_callback finishes,
+ * so providers can do expensive dump work without blocking the caller.
+ *
+ * The dump callback (async_dump_callback_block) waits for a release that
+ * this test only issues after add_subscription has returned.
+ *
+ * NOTE(review): nothing here asserts that the dump is still in flight
+ * when add_subscription returns. If subscribe ran the callback
+ * synchronously, the callback would wait for a release that is never
+ * issued, so a regression would presumably show up as a hang rather than
+ * as a failed assertion.
+ */
+KTEST_FUNC(dump_state_async_subscribe_returns_before_dump)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *cd;
+
+ KTEST_LOG(ctx, "Verifying subscribe returns before dump completes");
+
+ /* Set up the shared scratch used by async_dump_callback_block. */
+ atomic_store_rel_32(&async_dump_runs, 0);
+ mtx_init(&async_dump_mtx, "async_dump_mtx", NULL, MTX_DEF);
+ cv_init(&async_dump_cv, "async_dump_cv");
+ async_dump_release = false;
+
+ provider = test_create_provider("test_ds_async_block",
+ async_dump_callback_block, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+ mtx_init(&cd->lock, "test_ds_async_block", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ cd);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ /*
+ * Subscribe enqueues a dump that will block in the callback. The
+ * call must return promptly even though the dump is parked --
+ * that's the whole point of the rework.
+ */
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_async_block", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ /* Release the dump so it can complete and decrement dump_pending. */
+ mtx_lock(&async_dump_mtx);
+ async_dump_release = true;
+ cv_broadcast(&async_dump_cv);
+ mtx_unlock(&async_dump_mtx);
+
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&cd->lock);
+ free(cd, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+ /* The dump has drained, so nothing references the cv/mutex any more. */
+ cv_destroy(&async_dump_cv);
+ mtx_destroy(&async_dump_mtx);
+
+ return (0);
+}
+
+/*
+ * Verifies eventlog_subscriber_destroy() implicitly drains pending
+ * dump tasks rather than freeing memory out from under them. We
+ * subscribe with a callback that blocks, call destroy() from a helper
+ * thread, check that destroy() has not returned while the dump is
+ * still blocked, then release the dump from the test thread and
+ * confirm destroy() completes.
+ */
+
+/* Argument block shared between the test and destroy_drain_thread(). */
+struct destroy_drain_thread_arg {
+ struct eventlog_subscriber *subscriber; /* subscriber to destroy */
+ volatile bool started; /* set when the helper thread starts */
+ volatile bool returned; /* set once destroy() has returned */
+};
+
+/*
+ * Kernel thread body: destroys the subscriber named in the argument block
+ * and reports progress through its started / returned flags, then exits.
+ * arg points to a struct destroy_drain_thread_arg owned by the caller.
+ */
+static void
+destroy_drain_thread(void *arg)
+{
+ struct destroy_drain_thread_arg *a = arg;
+
+ a->started = true;
+ eventlog_subscriber_destroy(a->subscriber);
+ /* Last access to *a; the test polls this flag to detect completion. */
+ a->returned = true;
+ kthread_exit();
+}
+
+/*
+ * Checks that eventlog_subscriber_destroy() does not return while a dump
+ * callback for that subscriber is still running.
+ *
+ * Sequence: subscribe with a blocking dump callback, wait until the
+ * callback has been entered, start a helper thread that calls destroy(),
+ * verify destroy() has not returned after a pause, release the callback,
+ * and verify destroy() then returns.
+ *
+ * NOTE(review): arg lives on this function's stack and is written by the
+ * helper thread. If one of the KTEST checks after kthread_add returns
+ * early, the helper may still write to it afterwards -- confirm whether
+ * that path needs hardening.
+ */
+KTEST_FUNC(dump_state_destroy_waits_for_dump)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *cd;
+ struct destroy_drain_thread_arg arg;
+ struct thread *td;
+ int i;
+
+ KTEST_LOG(ctx, "Verifying destroy() drains in-flight dumps");
+
+ /* Set up the shared scratch used by async_dump_callback_block. */
+ atomic_store_rel_32(&async_dump_runs, 0);
+ mtx_init(&async_dump_mtx, "async_dump_mtx", NULL, MTX_DEF);
+ cv_init(&async_dump_cv, "async_dump_cv");
+ async_dump_release = false;
+
+ provider = test_create_provider("test_ds_destroy_drain",
+ async_dump_callback_block, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+ mtx_init(&cd->lock, "test_ds_destroy_drain", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ cd);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_destroy_drain", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ /*
+ * Wait for the dump task to actually start running (and block).
+ * Polls up to 1000 times, pausing one tick between polls.
+ */
+ for (i = 0; i < 1000; i++) {
+ if (atomic_load_acq_32(&async_dump_runs) == 1)
+ break;
+ pause("ds_run", 1);
+ }
+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
+
+ /*
+ * Spawn a thread that calls destroy(). It must NOT return until
+ * we release the dump callback below. We give it hz / 10 ticks to
+ * prove it's stuck waiting on dump_pending, then release the
+ * callback and wait for destroy() to complete.
+ */
+ memset(&arg, 0, sizeof(arg));
+ arg.subscriber = subscriber;
+ KTEST_EQUAL(kthread_add(destroy_drain_thread, &arg, NULL, &td, 0, 0,
+ "evl_ds_destroy_drain"), 0);
+
+ /* Wait for the destroy thread to start. */
+ for (i = 0; i < 1000; i++) {
+ if (arg.started)
+ break;
+ pause("ds_strt", 1);
+ }
+ KTEST_VERIFY(arg.started);
+
+ /*
+ * Confirm destroy() is parked while the dump callback is still
+ * blocked on the cv. arg.returned still being false after a
+ * generous wait is the signal.
+ */
+ pause("ds_park", hz / 10);
+ KTEST_VERIFY(!arg.returned);
+
+ /* Release the dump and wait for destroy() to come back. */
+ mtx_lock(&async_dump_mtx);
+ async_dump_release = true;
+ cv_broadcast(&async_dump_cv);
+ mtx_unlock(&async_dump_mtx);
+
+ for (i = 0; i < 1000; i++) {
+ if (arg.returned)
+ break;
+ pause("ds_done", 1);
+ }
+ KTEST_VERIFY(arg.returned);
+
+ /* The subscriber was already destroyed by the helper thread. */
+ eventlog_session_destroy(session);
+ mtx_destroy(&cd->lock);
+ free(cd, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+ cv_destroy(&async_dump_cv);
+ mtx_destroy(&async_dump_mtx);
+
+ return (0);
+}
+
+/*
+ * Verifies that subscribing again to a provider the subscriber is already
+ * subscribed to (here with a different level and keyword mask) does not
+ * re-fire the dump_callback. The replay is a one-shot per first-time
+ * subscribe; the subscriber already has the state from the original
+ * subscribe.
+ */
+KTEST_FUNC(dump_state_resubscribe_no_refire)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *subscriber;
+ struct test_callback_data *cd;
+
+ KTEST_LOG(ctx, "Verifying re-subscribe does not re-fire dump");
+
+ /* Reset the shared async scratch left behind by earlier tests. */
+ async_dump_thread = NULL;
+ async_dump_observed = false;
+ atomic_store_rel_32(&async_dump_runs, 0);
+
+ provider = test_create_provider("test_ds_resub",
+ async_dump_callback_record_thread, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
+ mtx_init(&cd->lock, "test_ds_resub", NULL, MTX_DEF);
+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
+ cd);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ /* First subscribe: the dump callback must run exactly once. */
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_resub", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+ eventlog_subscriber_drain_dumps(subscriber);
+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
+
+ /*
+ * Re-subscribe with different level/keywords -- update in place.
+ * The run counter must still read 1 after draining.
+ */
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
+ "test_ds_resub", EVENTLOG_LEVEL_INFO, 0xF0F0F0F0,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+ eventlog_subscriber_drain_dumps(subscriber);
+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(subscriber);
+ mtx_destroy(&cd->lock);
+ free(cd, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Verifies the framework emits an EVENTLOG_DUMP_COMPLETE_ID event to
+ * the requesting subscriber once the dump_callback returns. The
+ * callback intentionally emits no events, so DUMP_COMPLETE is the
+ * only thing the subscriber should see -- we check both event_count
+ * and last_event_id to pin that down.
+ *
+ * Subscribers that did not request EVENTLOG_KEYWORD_SESSION must
+ * NOT receive DUMP_COMPLETE; we verify this with a second subscriber
+ * that subscribes with a non-session keyword mask.
+ *
+ * Both subscribers pass EVENTLOG_SUBSCRIPTION_DUMP_STATE, so the dump
+ * callback is expected to run once per subscriber (async_dump_runs ==
+ * 2); only delivery of the completion event differs between them.
+ */
+KTEST_FUNC(dump_state_emits_dump_complete)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_session *session;
+ struct eventlog_subscriber *with_session, *without_session;
+ struct test_callback_data *cd_with, *cd_without;
+
+ KTEST_LOG(ctx, "Verifying DUMP_COMPLETE emission and keyword filter");
+
+ async_dump_thread = NULL;
+ async_dump_observed = false;
+ atomic_store_rel_32(&async_dump_runs, 0);
+
+ provider = test_create_provider("test_ds_complete",
+ async_dump_callback_record_thread, NULL);
+ KTEST_NEQUAL(provider, NULL);
+
+ session = eventlog_session_create(provider, 1, true, NULL, 0);
+ KTEST_NEQUAL(session, NULL);
+
+ cd_with = malloc(sizeof(*cd_with), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&cd_with->lock, "test_ds_complete_w", NULL, MTX_DEF);
+ with_session = eventlog_subscriber_create_callback(
+ test_event_callback, cd_with);
+ KTEST_NEQUAL(with_session, NULL);
+
+ cd_without = malloc(sizeof(*cd_without), M_EVENTLOG_TEST,
+ M_WAITOK | M_ZERO);
+ mtx_init(&cd_without->lock, "test_ds_complete_wo", NULL, MTX_DEF);
+ without_session = eventlog_subscriber_create_callback(
+ test_event_callback, cd_without);
+ KTEST_NEQUAL(without_session, NULL);
+
+ /* with_session: full mask -- includes EVENTLOG_KEYWORD_SESSION. */
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(with_session,
+ "test_ds_complete", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+ /* without_session: SESSION bit (0x80000000) explicitly cleared. */
+ KTEST_EQUAL(eventlog_subscriber_add_subscription(without_session,
+ "test_ds_complete", EVENTLOG_LEVEL_VERBOSE, 0x7FFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
+
+ eventlog_subscriber_drain_dumps(with_session);
+ eventlog_subscriber_drain_dumps(without_session);
+
+ /* dump_callback ran once for each subscriber */
+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 2);
+
+ /*
+ * with_session should have received exactly one event --
+ * the synthetic DUMP_COMPLETE. without_session should have
+ * received nothing (SESSION keyword stripped).
+ */
+ KTEST_EQUAL(atomic_load_acq_32(&cd_with->event_count), 1);
+ KTEST_EQUAL(atomic_load_acq_32(&cd_with->last_event_id),
+ EVENTLOG_DUMP_COMPLETE_ID);
+ KTEST_EQUAL(atomic_load_acq_32(&cd_without->event_count), 0);
+
+ eventlog_session_destroy(session);
+ eventlog_subscriber_destroy(with_session);
+ eventlog_subscriber_destroy(without_session);
+ mtx_destroy(&cd_with->lock);
+ free(cd_with, M_EVENTLOG_TEST);
+ mtx_destroy(&cd_without->lock);
+ free(cd_without, M_EVENTLOG_TEST);
+ eventlog_provider_destroy(provider);
+
+ return (0);
+}
+
+/*
+ * Multi-provider callback data: tracks events per provider_id to verify that
+ * events from multiple same-named providers are all delivered.
+ *
+ * Filled in by multi_provider_callback(). seen_provider_ids[i] and
+ * seen_provider_id_counts[i] are parallel arrays: slot i holds a provider_id
+ * and the number of events seen carrying it. num_distinct_providers is the
+ * number of slots in use; at most 8 distinct ids are recorded, and events
+ * from any further id only bump event_count.
+ */
+struct multi_provider_callback_data {
+ volatile uint32_t event_count;
+ volatile uint16_t seen_provider_ids[8];
+ volatile uint32_t seen_provider_id_counts[8];
+ volatile int num_distinct_providers;
+};
+
+/*
+ * Subscriber callback that tallies events per provider_id.
+ *
+ * Every call bumps the overall event_count. The provider_id from the
+ * event header is then looked up among the slots already in use; a hit
+ * bumps that slot's counter, a miss claims the next free slot (if any
+ * of the 8 is left) with a count of 1. Only hdr and callback_arg are
+ * used; callback_arg must point at a struct
+ * multi_provider_callback_data.
+ */
+static void
+multi_provider_callback(const struct eventlog_event_header *hdr,
+    const char *provider_name __unused, uint8_t provider_name_len __unused,
+    uint64_t session_id __unused,
+    const struct iovec *iov __unused, int iovcnt __unused,
+    size_t payload_size __unused, void *callback_arg)
+{
+	struct multi_provider_callback_data *stats = callback_arg;
+	int known, slot;
+
+	atomic_add_int(&stats->event_count, 1);
+
+	/* Look for a slot that already tracks this provider_id. */
+	known = atomic_load_acq_int(&stats->num_distinct_providers);
+	slot = 0;
+	while (slot < known &&
+	    stats->seen_provider_ids[slot] != hdr->provider_id)
+		slot++;
+
+	if (slot < known) {
+		atomic_add_int(&stats->seen_provider_id_counts[slot], 1);
+		return;
+	}
+
+	/* Table full: the event is still counted in event_count above. */
+	if (known >= 8)
+		return;
+
+	/*
+	 * First event from this provider_id. Claiming the slot is not
+	 * atomic with the lookup above; that is tolerated for the small
+	 * event counts these tests generate.
+	 */
+	stats->seen_provider_ids[known] = hdr->provider_id;
+	stats->seen_provider_id_counts[known] = 1;
+	atomic_add_rel_int(&stats->num_distinct_providers, 1);
+}
+
+/*
+ * Subscribing by name enables ALL providers with that name.
+ *
+ * Two providers are registered under one name. A single callback
+ * subscription at INFO/0x7 must raise the level and keyword mask
+ * reported for both, and destroying the subscriber must drop both back
+ * to EVENTLOG_LEVEL_NONE with an empty keyword mask. No sessions are
+ * created and no events are written; only enablement is checked.
+ */
+KTEST_FUNC(multi_provider_subscribe_enables_all)
+{
+ struct eventlog_provider *p1, *p2;
+ struct eventlog_subscriber *subscriber;
+ struct multi_provider_callback_data cb_data;
+ int error;
+
+ KTEST_LOG(ctx,
+ "Testing subscribe-by-name enables all matching providers");
+
+ p1 = test_create_provider("test_mp_en", NULL, NULL);
+ KTEST_NEQUAL(p1, NULL);
+ p2 = test_create_provider("test_mp_en", NULL, NULL);
+ KTEST_NEQUAL(p2, NULL);
+
+ /* Both providers should start disabled */
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_NONE);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
+
+ memset(&cb_data, 0, sizeof(cb_data));
+ subscriber = eventlog_subscriber_create_callback(
+ multi_provider_callback, &cb_data);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_en",
+ EVENTLOG_LEVEL_INFO, 0x7, 0);
+ KTEST_EQUAL(error, 0);
+
+ /* Both providers should now be enabled with the same level/keywords */
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_INFO);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_INFO);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0x7);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0x7);
+
+ /* Destroying subscriber should disable both */
+ eventlog_subscriber_destroy(subscriber);
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_NONE);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0);
+
+ eventlog_provider_destroy(p1);
+ eventlog_provider_destroy(p2);
+
+ return (0);
+}
+
+/*
+ * Events from both same-named providers reach a single subscriber,
+ * and they carry distinct provider_ids.
+ *
+ * The subscription is added before the sessions are created, which is
+ * why the expected totals include one SESSION_CREATE per session. The
+ * counters are checked right after the writes with no intervening
+ * wait, so this presumably relies on callback subscribers being
+ * invoked synchronously from the writer -- confirm against
+ * eventlog_event_write().
+ */
+KTEST_FUNC(multi_provider_events_from_both)
+{
+ struct eventlog_provider *p1, *p2;
+ struct eventlog_session *s1, *s2;
+ struct eventlog_subscriber *subscriber;
+ struct multi_provider_callback_data cb_data;
+ uint32_t payload = 0xCAFE;
+ int error;
+
+ KTEST_LOG(ctx, "Testing events from both same-named providers");
+
+ p1 = test_create_provider("test_mp_ev", NULL, NULL);
+ KTEST_NEQUAL(p1, NULL);
+ p2 = test_create_provider("test_mp_ev", NULL, NULL);
+ KTEST_NEQUAL(p2, NULL);
+
+ memset(&cb_data, 0, sizeof(cb_data));
+ subscriber = eventlog_subscriber_create_callback(
+ multi_provider_callback, &cb_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_ev",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ s1 = eventlog_session_create(p1, 100, true, NULL, 0);
+ KTEST_NEQUAL(s1, NULL);
+ s2 = eventlog_session_create(p2, 200, true, NULL, 0);
+ KTEST_NEQUAL(s2, NULL);
+
+ /* Write events from each provider */
+ eventlog_event_write(s1, 0x1001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+ eventlog_event_write(s2, 0x2001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+ eventlog_event_write(s1, 0x1002, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+ eventlog_event_write(s2, 0x2002, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+
+ /* 2 SESSION_CREATEs + 4 user events = 6 total */
+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.event_count), 6);
+
+ /* Events should have come from 2 distinct provider_ids */
+ KTEST_EQUAL(atomic_load_acq_int(&cb_data.num_distinct_providers), 2);
+ /* Each provider sent 3 events (1 SESSION_CREATE + 2 user) */
+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.seen_provider_id_counts[0]), 3);
+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.seen_provider_id_counts[1]), 3);
+
+ eventlog_session_destroy(s1);
+ eventlog_session_destroy(s2);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(p1);
+ eventlog_provider_destroy(p2);
+
+ return (0);
+}
+
+/*
+ * Destroying one same-named provider doesn't affect the other.
+ * Subscription and event delivery continue for the surviving provider.
+ *
+ * Sequence: subscribe to both providers, write one event through each,
+ * then destroy s1, the subscriber and p1. p2 and its session s2 are
+ * kept alive; p2 must report itself disabled while nobody subscribes,
+ * become enabled again when a fresh subscriber subscribes by the same
+ * name, and deliver exactly one new event for one write on s2.
+ */
+KTEST_FUNC(multi_provider_destroy_one)
+{
+ struct eventlog_provider *p1, *p2;
+ struct eventlog_session *s1, *s2;
+ struct eventlog_subscriber *subscriber;
+ struct multi_provider_callback_data cb_data;
+ uint32_t payload = 0xBEEF;
+ uint32_t count_before;
+ int error;
+
+ KTEST_LOG(ctx, "Testing destroy one of two same-named providers");
+
+ p1 = test_create_provider("test_mp_d1", NULL, NULL);
+ KTEST_NEQUAL(p1, NULL);
+ p2 = test_create_provider("test_mp_d1", NULL, NULL);
+ KTEST_NEQUAL(p2, NULL);
+
+ memset(&cb_data, 0, sizeof(cb_data));
+ subscriber = eventlog_subscriber_create_callback(
+ multi_provider_callback, &cb_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_d1",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ s1 = eventlog_session_create(p1, 1, true, NULL, 0);
+ KTEST_NEQUAL(s1, NULL);
+ s2 = eventlog_session_create(p2, 2, true, NULL, 0);
+ KTEST_NEQUAL(s2, NULL);
+
+ /* Write an event from each */
+ eventlog_event_write(s1, 0x1001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+ eventlog_event_write(s2, 0x2001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+
+ /*
+ * Destroy s1 and subscriber, then p1.
+ * Subscriber must be destroyed before its providers so that
+ * subscription pointers are cleaned up first.
+ */
+ eventlog_session_destroy(s1);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(p1);
+
+ /* p2 should now be disabled (no subscribers left) */
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0);
+
+ /*
+ * Re-subscribe to verify p2 still works after p1 is gone.
+ * cb_data is zeroed first so the counts start from scratch.
+ */
+ memset(&cb_data, 0, sizeof(cb_data));
+ subscriber = eventlog_subscriber_create_callback(
+ multi_provider_callback, &cb_data);
+ KTEST_NEQUAL(subscriber, NULL);
+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_d1",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+ KTEST_EQUAL(error, 0);
+
+ /* p2 should be enabled again */
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_VERBOSE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0xFFFFFFFF);
+
+ /* Events from p2 should arrive */
+ count_before = atomic_load_acq_32(&cb_data.event_count);
+ eventlog_event_write(s2, 0x2002, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+ &payload, sizeof(payload));
+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.event_count), count_before + 1);
+
+ eventlog_session_destroy(s2);
+ eventlog_subscriber_destroy(subscriber);
+ eventlog_provider_destroy(p2);
+
+ return (0);
+}
+
+/*
+ * Dump state callback is invoked for each matching provider when subscribing
+ * by name with DUMP_STATE.
+ *
+ * State shared between multi_provider_dump_state and its dump callback:
+ * mp_dump_invocations counts callback runs, mp_dump_sessions[] holds the
+ * sessions the callback writes replay events to (NULL entries are skipped),
+ * and mp_dump_session_count is how many leading entries the callback scans.
+ */
+static volatile uint32_t mp_dump_invocations;
+static struct eventlog_session *mp_dump_sessions[4];
+static int mp_dump_session_count;
+
+/*
+ * Dump callback shared by the multi-provider dump-state test.
+ *
+ * Counts the invocation, then writes one INFO-level event (id 0x200 +
+ * index, payload 0xdead0000 | index) to every registered session whose
+ * effective_level is at least EVENTLOG_LEVEL_INFO. Both arguments are
+ * ignored; the sessions come from mp_dump_sessions[].
+ */
+static void
+mp_test_dump_callback(struct eventlog_provider *provider __unused,
+    void *arg __unused)
+{
+	struct eventlog_session *sess;
+	uint32_t data;
+	int idx;
+
+	atomic_add_int(&mp_dump_invocations, 1);
+
+	for (idx = 0; idx < mp_dump_session_count; idx++) {
+		sess = mp_dump_sessions[idx];
+
+		/* Skip empty slots and sessions not enabled for INFO. */
+		if (sess == NULL)
+			continue;
+		if (sess->effective_level < EVENTLOG_LEVEL_INFO)
+			continue;
+
+		data = 0xdead0000 | idx;
+		eventlog_event_write(sess, 0x200 + idx, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &data, sizeof(data));
+	}
+}
+
+/*
+ * Subscribes by name, with EVENTLOG_SUBSCRIPTION_DUMP_STATE, to two
+ * providers that share a name and a dump callback. After draining the
+ * dumps the test expects at least two callback invocations, at least
+ * two delivered events, and events from exactly two distinct
+ * provider_ids.
+ *
+ * Checks made once the subscription has been requested are reported by
+ * hand and routed through the cleanup label instead of the KTEST_*
+ * macros, presumably so the global mp_dump_sessions[] entries are
+ * always cleared before the sessions they point at are destroyed.
+ */
+KTEST_FUNC(multi_provider_dump_state)
+{
+ struct eventlog_provider *p1 = NULL, *p2 = NULL;
+ struct eventlog_session *s1 = NULL, *s2 = NULL;
+ struct eventlog_subscriber *subscriber = NULL;
+ struct multi_provider_callback_data cb_data;
+ uint32_t invocations, ec;
+ int ret = 0;
+ int error;
+
+ KTEST_LOG(ctx, "Testing dump state invoked for each matching provider");
+
+ mp_dump_invocations = 0;
+ mp_dump_session_count = 2;
+
+ p1 = test_create_provider("test_mp_ds", mp_test_dump_callback, NULL);
+ KTEST_NEQUAL(p1, NULL);
+ p2 = test_create_provider("test_mp_ds", mp_test_dump_callback, NULL);
+ KTEST_NEQUAL(p2, NULL);
+
+ s1 = eventlog_session_create(p1, 1, true, NULL, 0);
+ KTEST_NEQUAL(s1, NULL);
+ mp_dump_sessions[0] = s1;
+
+ s2 = eventlog_session_create(p2, 2, true, NULL, 0);
+ KTEST_NEQUAL(s2, NULL);
+ mp_dump_sessions[1] = s2;
+
+ memset(&cb_data, 0, sizeof(cb_data));
+ subscriber = eventlog_subscriber_create_callback(
+ multi_provider_callback, &cb_data);
+ KTEST_NEQUAL(subscriber, NULL);
+
+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_ds",
+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
+ EVENTLOG_SUBSCRIPTION_DUMP_STATE);
+ if (error != 0) {
+ KTEST_ERR(ctx, "FAIL: add_subscription returned %d", error);
+ ret = EINVAL;
+ goto cleanup;
+ }
+ KTEST_LOG(ctx, "PASS: error == 0");
+
+ /*
+ * Two providers share the name, so two dump tasks were enqueued.
+ * Drain so the invocation/event-count assertions below see the
+ * post-dump steady state.
+ */
+ eventlog_subscriber_drain_dumps(subscriber);
+
+ invocations = atomic_load_acq_32(&mp_dump_invocations);
+ KTEST_LOG(ctx, "Dump callback invoked %u times", invocations);
+ if (invocations < 2) {
+ KTEST_ERR(ctx, "FAIL: dump invocations %u < 2", invocations);
+ ret = EINVAL;
+ goto cleanup;
+ }
+ KTEST_LOG(ctx, "PASS: invocations >= 2");
+
+ ec = atomic_load_acq_32(&cb_data.event_count);
+ KTEST_LOG(ctx, "Subscriber received %u events", ec);
+ if (ec < 2) {
+ KTEST_ERR(ctx, "FAIL: event_count %u < 2", ec);
+ ret = EINVAL;
+ goto cleanup;
+ }
+ KTEST_LOG(ctx, "PASS: event_count >= 2");
+
+ if (atomic_load_acq_int(&cb_data.num_distinct_providers) != 2) {
+ KTEST_ERR(ctx, "FAIL: num_distinct_providers %d != 2",
+ atomic_load_acq_int(&cb_data.num_distinct_providers));
+ ret = EINVAL;
+ goto cleanup;
+ }
+ KTEST_LOG(ctx, "PASS: num_distinct_providers == 2");
+
+cleanup:
+ mp_dump_sessions[0] = NULL;
+ mp_dump_sessions[1] = NULL;
+ if (s1 != NULL)
+ eventlog_session_destroy(s1);
+ if (s2 != NULL)
+ eventlog_session_destroy(s2);
+ if (subscriber != NULL)
+ eventlog_subscriber_destroy(subscriber);
+ if (p1 != NULL)
+ eventlog_provider_destroy(p1);
+ if (p2 != NULL)
+ eventlog_provider_destroy(p2);
+
+ return (ret);
+}
+
+/*
+ * Two subscribers with different filters targeting same-named providers.
+ * Each provider instance gets its enablement from the union of all subscribers.
+ *
+ * Both subscribers subscribe by the same provider name, the first at
+ * INFO/0x3 and the second at VERBOSE/0xC. The expected level is the more
+ * verbose of the two and the expected keyword mask is the bitwise OR; after
+ * each subscriber is destroyed the values are recomputed from whoever is left.
+ * The final step checks only the level, not the keyword mask.
+ */
+KTEST_FUNC(multi_provider_independent_enablement)
+{
+ struct eventlog_provider *p1, *p2;
+ struct eventlog_subscriber *sub_name, *sub_other;
+ struct multi_provider_callback_data cb1, cb2;
+ int error;
+
+ KTEST_LOG(ctx, "Testing per-provider enablement with multi-provider");
+
+ p1 = test_create_provider("test_mp_ie", NULL, NULL);
+ KTEST_NEQUAL(p1, NULL);
+ p2 = test_create_provider("test_mp_ie", NULL, NULL);
+ KTEST_NEQUAL(p2, NULL);
+
+ /* Subscribe to "test_mp_ie" at INFO/0x3 - enables both providers */
+ memset(&cb1, 0, sizeof(cb1));
+ sub_name = eventlog_subscriber_create_callback(multi_provider_callback,
+ &cb1);
+ KTEST_NEQUAL(sub_name, NULL);
+ error = eventlog_subscriber_add_subscription(sub_name, "test_mp_ie",
+ EVENTLOG_LEVEL_INFO, 0x3, 0);
+ KTEST_EQUAL(error, 0);
+
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_INFO);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_INFO);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0x3);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0x3);
+
+ /* Add second subscriber at VERBOSE/0xC - both get union */
+ memset(&cb2, 0, sizeof(cb2));
+ sub_other = eventlog_subscriber_create_callback(multi_provider_callback,
+ &cb2);
+ KTEST_NEQUAL(sub_other, NULL);
+ error = eventlog_subscriber_add_subscription(sub_other, "test_mp_ie",
+ EVENTLOG_LEVEL_VERBOSE, 0xC, 0);
+ KTEST_EQUAL(error, 0);
+
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_VERBOSE);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_VERBOSE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0xF); /* 0x3 | 0xC */
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0xF);
+
+ /* Remove first subscriber - enablement drops to VERBOSE/0xC */
+ eventlog_subscriber_destroy(sub_name);
+
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_VERBOSE);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_VERBOSE);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0xC);
+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0xC);
+
+ /* Remove second subscriber - both disabled */
+ eventlog_subscriber_destroy(sub_other);
+
+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_NONE);
+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
+
+ eventlog_provider_destroy(p1);
+ eventlog_provider_destroy(p2);
+
+ return (0);
+}
+
+/*
+ * Device subscriber receives events from all same-named providers.
+ * Verifies multi-provider support works with device (buffered) subscribers,
+ * not just callback subscribers.
+ *
+ * Ten user events (five per provider) are written after subscribing, read
+ * back with one non-blocking eventlog_read_into_buf() call and walked record
+ * by record using each header's event_length.
+ *
+ * The 8 KiB read buffer comes from malloc(9) rather than the stack: kernel
+ * thread stacks are only a few pages, and an 8 KiB automatic array uses a
+ * large share of one. The buffer is released before the first assertion
+ * that follows the read, so a failing check cannot leak it.
+ */
+KTEST_FUNC(multi_provider_device_subscriber)
+{
+	struct eventlog_provider *p1, *p2;
+	struct eventlog_session *s1, *s2;
+	struct eventlog_subscriber *subscriber;
+	struct eventlog_event_header *hdr;
+	const size_t read_buf_size = 8 * 1024;
+	char *read_buf;
+	uint32_t payload = 0xFACE;
+	uint16_t seen_ids[2] = {0, 0};
+	size_t total_read;
+	size_t offset;
+	int num_distinct = 0;
+	int total_events = 0;
+	int error, i;
+
+	KTEST_LOG(ctx,
+	    "Testing device subscriber with multiple same-named providers");
+
+	p1 = test_create_provider("test_mp_dev", NULL, NULL);
+	KTEST_NEQUAL(p1, NULL);
+	p2 = test_create_provider("test_mp_dev", NULL, NULL);
+	KTEST_NEQUAL(p2, NULL);
+
+	subscriber = eventlog_subscriber_create_device(
+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
+	KTEST_NEQUAL(subscriber, NULL);
+	error = eventlog_subscriber_add_subscription(subscriber, "test_mp_dev",
+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
+	KTEST_EQUAL(error, 0);
+
+	s1 = eventlog_session_create(p1, 1, true, NULL, 0);
+	KTEST_NEQUAL(s1, NULL);
+	s2 = eventlog_session_create(p2, 2, true, NULL, 0);
+	KTEST_NEQUAL(s2, NULL);
+
+	/* Write events from each provider */
+	for (i = 0; i < 5; i++) {
+		eventlog_event_write(s1, 0x1000 + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &payload, sizeof(payload));
+		eventlog_event_write(s2, 0x2000 + i, EVENTLOG_LEVEL_INFO,
+		    0xFFFFFFFF, &payload, sizeof(payload));
+	}
+
+	/* Read all available events into a heap buffer */
+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
+	total_read = eventlog_read_into_buf(subscriber, read_buf,
+	    read_buf_size, FNONBLOCK);
+
+	/*
+	 * Parse events and count distinct provider_ids. With nothing read
+	 * the loop body never runs, so it is safe to parse before checking
+	 * total_read below.
+	 */
+	offset = 0;
+	while (offset + sizeof(struct eventlog_event_header) <= total_read) {
+		bool found;
+
+		hdr = (struct eventlog_event_header *)(read_buf + offset);
+		/* Stop on a record that is malformed or cut off. */
+		if (hdr->event_length < sizeof(struct eventlog_event_header) ||
+		    offset + hdr->event_length > total_read)
+			break;
+
+		total_events++;
+		found = false;
+		for (i = 0; i < num_distinct; i++) {
+			if (seen_ids[i] == hdr->provider_id) {
+				found = true;
+				break;
+			}
+		}
+		if (!found && num_distinct < 2) {
+			seen_ids[num_distinct++] = hdr->provider_id;
+		}
+		offset += hdr->event_length;
+	}
+
+	/* Only the tallies are needed from here on. */
+	free(read_buf, M_EVENTLOG_TEST);
+
+	KTEST_VERIFY(total_read > 0);
+
+	KTEST_LOG(ctx, "Read %d events from %d distinct provider_ids",
+	    total_events, num_distinct);
+
+	/* 2 SESSION_CREATEs + 10 user events = 12 total */
+	KTEST_EQUAL(total_events, 12);
+	KTEST_EQUAL(num_distinct, 2);
+
+	eventlog_session_destroy(s1);
+	eventlog_session_destroy(s2);
+	eventlog_subscriber_destroy(subscriber);
+	eventlog_provider_destroy(p1);
+	eventlog_provider_destroy(p2);
+
+	return (0);
+}
+
+/*
+ * Helpers + tests for the subscribers_changed provider callback and
+ * eventlog_provider_config (NULL config, default_enabled, etc).
+ *
+ * Contract for subscribers_changed: fires exactly once per real
+ * 0<->N transition, runs without sessions_lock so the callback may
+ * sleep, NULL is a safe "no callback" value.
+ *
+ * struct subch_count is what test_subch_count_cb() fills in. The tests
+ * initialise it positionally ({ 0, 0, 0 }), so keep the field order.
+ */
+
+struct subch_count {
+ volatile int n_true; /* callbacks with has_subscribers=true */
+ volatile int n_false; /* ...with has_subscribers=false */
+ volatile int last_state; /* 1 or 0: has_subscribers of the latest call */
+};
+
+/*
+ * subscribers_changed callback that records what it was told.
+ *
+ * arg must point at a struct subch_count. Each call bumps n_true or
+ * n_false according to has_subscribers and then stores the same value
+ * (as 1 or 0) in last_state. The provider argument is ignored.
+ */
+static void
+test_subch_count_cb(struct eventlog_provider *provider __unused,
+    bool has_subscribers, void *arg)
+{
+	struct subch_count *counts = arg;
+	volatile int *edge_counter;
+
+	/* Pick the counter that matches the reported direction. */
+	edge_counter = has_subscribers ? &counts->n_true : &counts->n_false;
+	atomic_add_int(edge_counter, 1);
+
+	atomic_store_rel_32((volatile uint32_t *)&counts->last_state,
+	    has_subscribers ? 1 : 0);
+}
+
+/*
+ * Walks one provider through 0 -> 1 -> 2 -> 1 -> 0 subscribers and
+ * checks, after every step, how many times the subscribers_changed
+ * callback has reported true and false. Only the first subscribe and
+ * the last destroy may move the counters.
+ */
+KTEST_FUNC(subscribers_changed_basic)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *sub1, *sub2;
+ struct test_callback_data *cb1, *cb2;
+ struct subch_count c = { 0, 0, 0 };
+ struct eventlog_provider_config cfg = {
+ .subscribers_changed = test_subch_count_cb,
+ .subscribers_changed_arg = &c,
+ };
+
+ KTEST_LOG(ctx, "subscribers_changed fires exactly once per 0<->N edge");
+
+ provider = eventlog_provider_create("test_subch_basic", &cfg);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* No subscriber yet -> no callback ever fired. */
+ KTEST_EQUAL(c.n_true, 0);
+ KTEST_EQUAL(c.n_false, 0);
+
+ /* First subscriber: 0->1 transition, expect one (true). */
+ sub1 = test_enable_provider_callback("test_subch_basic",
+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb1);
+ KTEST_NEQUAL(sub1, NULL);
+ KTEST_EQUAL(c.n_true, 1);
+ KTEST_EQUAL(c.n_false, 0);
+ KTEST_EQUAL(c.last_state, 1);
+
+ /* Second subscriber: 1->2, no transition, no callback. */
+ sub2 = test_enable_provider_callback("test_subch_basic",
+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb2);
+ KTEST_NEQUAL(sub2, NULL);
+ KTEST_EQUAL(c.n_true, 1);
+ KTEST_EQUAL(c.n_false, 0);
+
+ /* Drop one subscriber: 2->1, no transition, no callback. */
+ eventlog_subscriber_destroy(sub2);
+ mtx_destroy(&cb2->lock);
+ free(cb2, M_EVENTLOG_TEST);
+ KTEST_EQUAL(c.n_true, 1);
+ KTEST_EQUAL(c.n_false, 0);
+
+ /* Drop the last subscriber: 1->0, expect one (false). */
+ eventlog_subscriber_destroy(sub1);
+ mtx_destroy(&cb1->lock);
+ free(cb1, M_EVENTLOG_TEST);
+ KTEST_EQUAL(c.n_true, 1);
+ KTEST_EQUAL(c.n_false, 1);
+ KTEST_EQUAL(c.last_state, 0);
+
+ eventlog_provider_destroy(provider);
+ return (0);
+}
+
+/*
+ * Creates a provider from a config whose subscribers_changed member is
+ * left NULL and runs one subscribe/destroy cycle against it. Nothing is
+ * asserted beyond the handles being non-NULL; the test passes by not
+ * crashing.
+ */
+KTEST_FUNC(subscribers_changed_null_safe)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *sub;
+ struct test_callback_data *cb;
+ struct eventlog_provider_config cfg = {
+ /* subscribers_changed deliberately NULL */
+ };
+
+ KTEST_LOG(ctx, "NULL subscribers_changed is a safe no-op");
+
+ provider = eventlog_provider_create("test_subch_null", &cfg);
+ KTEST_NEQUAL(provider, NULL);
+
+ /* Exercise sub/unsub cycle; NULL callback should not crash. */
+ sub = test_enable_provider_callback("test_subch_null",
+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb);
+ KTEST_NEQUAL(sub, NULL);
+
+ eventlog_subscriber_destroy(sub);
+ mtx_destroy(&cb->lock);
+ free(cb, M_EVENTLOG_TEST);
+
+ eventlog_provider_destroy(provider);
+ return (0);
+}
+
+/*
+ * subscribers_changed_runs_unlocked: prove the callback is invoked in a
+ * context where the caller is permitted to sleep / take its own
+ * sleepable locks. If sessions_lock (or any non-sleepable lock) were
+ * held when the callback fired, sx_xlock + pause would WITNESS- /
+ * INVARIANTS-fail with "sleeping with mutex held".
+ */
+struct subch_unlocked_state {
+ struct sx outer; /* sleepable lock the callback takes */
+ int n; /* callback invocations that ran to completion */
+};
+
+/*
+ * subscribers_changed callback that deliberately sleeps.
+ *
+ * arg must point at a struct subch_unlocked_state. The callback asserts
+ * that sleeping is allowed, takes the sx lock exclusively, sleeps for
+ * one tick while holding it, drops the lock and finally bumps the
+ * completion counter. The provider and has_subscribers arguments are
+ * ignored.
+ */
+static void
+test_subch_unlocked_cb(struct eventlog_provider *provider __unused,
+    bool has_subscribers __unused, void *arg)
+{
+	struct subch_unlocked_state *state = arg;
+
+	MPASS(THREAD_CAN_SLEEP());
+
+	/*
+	 * Hold a sleepable lock across a one-tick sleep. If the caller
+	 * still held a non-sleepable lock (sessions_lock being the one
+	 * that matters), WITNESS / INVARIANTS would complain here.
+	 */
+	sx_xlock(&state->outer);
+	pause("subch", 1);
+	sx_xunlock(&state->outer);
+
+	atomic_add_int(&state->n, 1);
+}
+
+/*
+ * Drives test_subch_unlocked_cb through one subscribe and one destroy
+ * and expects it to have completed once after each (s.n == 1, then 2).
+ * The sleeping done inside the callback is the actual check; the
+ * counter only confirms the callback really ran.
+ */
+KTEST_FUNC(subscribers_changed_runs_unlocked)
+{
+ struct eventlog_provider *provider;
+ struct eventlog_subscriber *sub;
+ struct test_callback_data *cb;
+ struct subch_unlocked_state s;
+ struct eventlog_provider_config cfg;
+
+ KTEST_LOG(ctx,
+ "callback runs outside sessions_lock (sleepable context)");
+
+ bzero(&s, sizeof(s));
+ sx_init(&s.outer, "test_subch_outer");
+ cfg = (struct eventlog_provider_config){
+ .subscribers_changed = test_subch_unlocked_cb,
+ .subscribers_changed_arg = &s,
+ };
+
+ provider = eventlog_provider_create("test_subch_unlocked", &cfg);
+ KTEST_NEQUAL(provider, NULL);
+
+ sub = test_enable_provider_callback("test_subch_unlocked",
+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb);
+ KTEST_NEQUAL(sub, NULL);
+ KTEST_EQUAL(s.n, 1);
+
+ eventlog_subscriber_destroy(sub);
+ mtx_destroy(&cb->lock);
+ free(cb, M_EVENTLOG_TEST);
+ KTEST_EQUAL(s.n, 2);
+
+ eventlog_provider_destroy(provider);
+ sx_destroy(&s.outer);
+ return (0);
+}
+
+/*
+ * Concurrent subscribe / unsubscribe storm. Smoke test for
+ * eventlog_update_provider_enablement under contention. Starts and
+ * ends at the no-subscribers state, so n_true must equal n_false at
+ * quiesce; INVARIANTS-only MPASS checks backstop subtler races.
+ *
+ * struct subch_storm_args is the per-worker argument block:
+ * provider_name is what the worker subscribes to, stop points at the
+ * stop flag shared by all workers, and iterations_done / exited are
+ * written by the worker just before it exits. The provider member is
+ * filled in by the test but not read by the worker.
+ */
+struct subch_storm_args {
+ struct eventlog_provider *provider;
+ const char *provider_name;
+ int *stop;
+ int iterations_done;
+ int exited;
+};
+
+/*
+ * Worker thread for the subscribe/unsubscribe storm.
+ *
+ * Until the shared stop flag becomes non-zero, repeatedly creates a
+ * callback subscriber for the named provider and tears it down again,
+ * yielding the CPU after each round. A failed subscribe ends the loop
+ * early. On the way out the number of completed rounds is published in
+ * iterations_done, exited is set, any sleeper on &exited is woken and
+ * the thread terminates. arg must point at a struct subch_storm_args.
+ */
+static void
+test_subch_storm_thread(void *arg)
+{
+	struct subch_storm_args *storm = arg;
+	struct eventlog_subscriber *subscriber;
+	struct test_callback_data *cbdata;
+	int rounds;
+
+	for (rounds = 0;
+	    atomic_load_acq_32((volatile uint32_t *)storm->stop) == 0;
+	    rounds++) {
+		subscriber = test_enable_provider_callback(
+		    storm->provider_name, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
+		    &cbdata);
+		if (subscriber == NULL)
+			break;
+
+		/* Tear the subscriber and its callback data down again. */
+		eventlog_subscriber_destroy(subscriber);
+		mtx_destroy(&cbdata->lock);
+		free(cbdata, M_EVENTLOG_TEST);
+
+		kern_yield(PRI_UNCHANGED);
+	}
+
+	/* Publish the result before announcing that we are done. */
+	atomic_store_rel_32((volatile uint32_t *)&storm->iterations_done,
+	    rounds);
+	atomic_store_rel_32((volatile uint32_t *)&storm->exited, 1);
+	wakeup(&storm->exited);
+	kthread_exit();
+}
+
+/*
+ * Callout handler that ends the storm: sets the int that arg points at
+ * to 1 and wakes anything sleeping on that address.
+ */
+static void
+test_subch_stop_callout(void *arg)
+{
+	atomic_store_rel_32((volatile uint32_t *)arg, 1);
+	wakeup(arg);
+}
+
+/*
+ * Runs SUBCH_NTHREADS workers that subscribe to and unsubscribe from
+ * one provider in a tight loop for SUBCH_RUNTIME_S seconds, then checks
+ * that the subscribers_changed callback reported as many "true" edges
+ * as "false" edges and that the last report was "no subscribers".
+ *
+ * The workers are handed pointers into this stack frame (args[] and
+ * stop), so the function must not return while any of them is still
+ * running. If kthread_add() fails part-way through, the workers already
+ * started are told to stop and are waited for before the failure is
+ * reported.
+ */
+KTEST_FUNC(subscribers_changed_concurrent_subunsub)
+{
+#define	SUBCH_NTHREADS	8
+#define	SUBCH_RUNTIME_S	1
+	struct eventlog_provider *provider;
+	struct subch_count c = { 0, 0, 0 };
+	struct subch_storm_args args[SUBCH_NTHREADS];
+	struct thread *threads[SUBCH_NTHREADS];
+	struct callout stop_co;
+	int stop = 0;
+	int i, error, nstarted;
+	int total_iterations;
+	struct eventlog_provider_config cfg = {
+		.subscribers_changed = test_subch_count_cb,
+		.subscribers_changed_arg = &c,
+	};
+
+	KTEST_LOG(ctx, "concurrent sub/unsub: %d threads x %d s, no phantom "
+	    "transitions, n_true == n_false at quiesce",
+	    SUBCH_NTHREADS, SUBCH_RUNTIME_S);
+
+	provider = eventlog_provider_create("test_subch_storm", &cfg);
+	KTEST_NEQUAL(provider, NULL);
+
+	/* Start the workers; nstarted ends up as the number running. */
+	error = 0;
+	for (nstarted = 0; nstarted < SUBCH_NTHREADS; nstarted++) {
+		bzero(&args[nstarted], sizeof(args[nstarted]));
+		args[nstarted].provider = provider;
+		args[nstarted].provider_name = "test_subch_storm";
+		args[nstarted].stop = &stop;
+		error = kthread_add(test_subch_storm_thread, &args[nstarted],
+		    NULL, &threads[nstarted], 0, 0, "subch_storm_%d",
+		    nstarted);
+		if (error != 0)
+			break;
+	}
+
+	if (error == 0) {
+		/* Let the storm run for the configured time. */
+		callout_init(&stop_co, 1);
+		callout_reset(&stop_co, hz * SUBCH_RUNTIME_S,
+		    test_subch_stop_callout, &stop);
+	} else {
+		/* Could not start everyone: stop the ones that did start. */
+		atomic_store_rel_32((volatile uint32_t *)&stop, 1);
+	}
+
+	/* Join: wait until every started worker has flagged its exit. */
+	for (i = 0; i < nstarted; i++) {
+		while (atomic_load_acq_32(
+		    (volatile uint32_t *)&args[i].exited) == 0)
+			tsleep(&args[i].exited, 0, "subch_w", hz / 10);
+	}
+
+	if (error != 0) {
+		KTEST_ERR(ctx, "FAIL: kthread_add returned %d", error);
+		eventlog_provider_destroy(provider);
+		return (error);
+	}
+	callout_drain(&stop_co);
+
+	total_iterations = 0;
+	for (i = 0; i < SUBCH_NTHREADS; i++)
+		total_iterations += args[i].iterations_done;
+	KTEST_LOG(ctx, "total sub/unsub iterations: %d, n_true=%d n_false=%d",
+	    total_iterations, c.n_true, c.n_false);
+
+	/*
+	 * All subscribers are gone, so every 0->N edge (n_true) must
+	 * have a matching N->0 edge (n_false). Without locking around
+	 * the recount, races could produce unbalanced counts.
+	 */
+	KTEST_VERIFY(c.n_true > 0);
+	KTEST_VERIFY(c.n_false > 0);
+	KTEST_EQUAL(c.n_true, c.n_false);
+	KTEST_EQUAL(c.last_state, 0);
+
+	eventlog_provider_destroy(provider);
+	return (0);
+#undef SUBCH_NTHREADS
+#undef SUBCH_RUNTIME_S
+}
+
+/*
+ * NULL config must be equivalent to a zero-initialised struct: no
+ * callbacks, default_enabled == 0.
+ *
+ * Equivalence is checked through two observable results only:
+ * eventlog_provider_get_default() returns 0 for both providers, and a
+ * session created on either reports eventlog_session_is_enabled() == 0.
+ * The callback members are not exercised here.
+ */
+KTEST_FUNC(provider_config_null_equivalent)
+{
+ struct eventlog_provider *p_null, *p_zero;
+ struct eventlog_session *s_null, *s_zero;
+ struct eventlog_provider_config cfg_zero = { 0 };
+
+ KTEST_LOG(ctx, "NULL config behaves identically to {0}");
+
+ p_null = eventlog_provider_create("test_cfg_null", NULL);
+ KTEST_NEQUAL(p_null, NULL);
+ p_zero = eventlog_provider_create("test_cfg_zero", &cfg_zero);
+ KTEST_NEQUAL(p_zero, NULL);
+
+ /* Both providers default to disabled (default_enabled == 0). */
+ KTEST_EQUAL(eventlog_provider_get_default(p_null), 0);
+ KTEST_EQUAL(eventlog_provider_get_default(p_zero), 0);
+
+ /* Sessions on either start disabled. */
+ s_null = eventlog_session_create(p_null, 0, true, NULL, 0);
+ KTEST_NEQUAL(s_null, NULL);
+ KTEST_EQUAL(eventlog_session_is_enabled(s_null), 0);
+ s_zero = eventlog_session_create(p_zero, 0, true, NULL, 0);
+ KTEST_NEQUAL(s_zero, NULL);
+ KTEST_EQUAL(eventlog_session_is_enabled(s_zero), 0);
+
+ eventlog_session_destroy(s_null);
+ eventlog_session_destroy(s_zero);
+ eventlog_provider_destroy(p_null);
+ eventlog_provider_destroy(p_zero);
+ return (0);
+}
+
+/*
+ * cfg.default_enabled = 1 must cause sessions to start enabled
+ * without an explicit eventlog_session_set_enabled call. Same shape
+ * with default_enabled = 0 must start disabled.
+ *
+ * Nothing subscribes to either provider in this test, so the session
+ * state observed comes from the provider default alone.
+ */
+KTEST_FUNC(provider_config_default_enabled)
+{
+ struct eventlog_provider *p_on, *p_off;
+ struct eventlog_session *s_on, *s_off;
+ struct eventlog_provider_config cfg_on = { .default_enabled = 1 };
+ struct eventlog_provider_config cfg_off = { .default_enabled = 0 };
+
+ KTEST_LOG(ctx, "cfg.default_enabled controls session start state");
+
+ p_on = eventlog_provider_create("test_cfg_def_on", &cfg_on);
+ KTEST_NEQUAL(p_on, NULL);
+ KTEST_EQUAL(eventlog_provider_get_default(p_on), 1);
+
+ p_off = eventlog_provider_create("test_cfg_def_off", &cfg_off);
+ KTEST_NEQUAL(p_off, NULL);
+ KTEST_EQUAL(eventlog_provider_get_default(p_off), 0);
+
+ s_on = eventlog_session_create(p_on, 0, true, NULL, 0);
+ KTEST_NEQUAL(s_on, NULL);
+ KTEST_EQUAL(eventlog_session_is_enabled(s_on), 1);
+
+ s_off = eventlog_session_create(p_off, 0, true, NULL, 0);
+ KTEST_NEQUAL(s_off, NULL);
+ KTEST_EQUAL(eventlog_session_is_enabled(s_off), 0);
+
+ eventlog_session_destroy(s_on);
+ eventlog_session_destroy(s_off);
+ eventlog_provider_destroy(p_on);
+ eventlog_provider_destroy(p_off);
+ return (0);
+}
+
+/*
+ * Test table handed to KTEST_MODULE_DECLARE() below, one KTEST_INFO()
+ * entry per test. A KTEST_FUNC() that is not listed here is presumably
+ * never run by the framework, so new tests need an entry.
+ */
+static const struct ktest_test_info tests[] = {
+ KTEST_INFO(provider_init_cleanup),
+ KTEST_INFO(session_create_destroy),
+ KTEST_INFO(event_logging_basic),
+ KTEST_INFO(event_logging_multiple),
+ KTEST_INFO(provider_independence),
+ KTEST_INFO(event_data_integrity),
+ KTEST_INFO(event_size_variations),
+ KTEST_INFO(multithreaded_logging),
+ KTEST_INFO(subscriber_create_destroy),
+ KTEST_INFO(subscriber_create_device_invalid_size),
+ KTEST_INFO(subscriber_add_subscription_nonexistent_provider),
+ KTEST_INFO(subscriber_read_error_paths),
+ KTEST_INFO(null_pointer_destroy),
+ KTEST_INFO(subscriber_level_keyword_filtering),
+ KTEST_INFO(event_oversized_dropped),
+ KTEST_INFO(event_edge_cases_payload_session),
+ KTEST_INFO(subscriber_subscription_update_in_place),
+ KTEST_INFO(subscriber_multiple_subscribers),
+ KTEST_INFO(subscriber_provider_enablement_aggregation),
+ KTEST_INFO(subscriber_device_buffer),
+ KTEST_INFO(subscriber_circular_buffer),
+ KTEST_INFO(subscriber_double_buffer_race),
+ KTEST_INFO(subscriber_mid_read_swap),
+ KTEST_INFO(subscriber_buffer_boundary_stress),
+ KTEST_INFO(subscriber_buffer_fill_to_capacity),
+ KTEST_INFO(subscriber_rapid_swap_stress),
+ KTEST_INFO(subscriber_callback),
+ KTEST_INFO(schema_generated_macros),
+ KTEST_INFO(schema_varlen_event),
+ KTEST_INFO(event_write_gather),
+ KTEST_INFO(lockfree_many_concurrent_writers),
+ KTEST_INFO(lockfree_writer_swap_contention),
+ KTEST_INFO(lockfree_buffer_full_contention),
+ KTEST_INFO(lockfree_data_integrity_under_contention),
+ KTEST_INFO(lockfree_reader_writer_swap_race),
+ KTEST_INFO(timestamp_epoch_boundary),
+ KTEST_INFO(timestamp_epoch_normal_delivery),
+ KTEST_INFO(timestamp_epoch_small_uio),
+ KTEST_INFO(dump_state_basic),
+ KTEST_INFO(dump_state_routing),
+ KTEST_INFO(dump_state_no_callback),
+ KTEST_INFO(dump_state_curvnet_not_set),
+ KTEST_INFO(dump_state_disabled_sessions),
+ KTEST_INFO(dump_state_async_runs_off_caller_thread),
+ KTEST_INFO(dump_state_async_subscribe_returns_before_dump),
+ KTEST_INFO(dump_state_destroy_waits_for_dump),
+ KTEST_INFO(dump_state_resubscribe_no_refire),
+ KTEST_INFO(dump_state_emits_dump_complete),
+ KTEST_INFO(multi_provider_subscribe_enables_all),
+ KTEST_INFO(multi_provider_events_from_both),
+ KTEST_INFO(multi_provider_destroy_one),
+ KTEST_INFO(multi_provider_dump_state),
+ KTEST_INFO(multi_provider_independent_enablement),
+ KTEST_INFO(multi_provider_device_subscriber),
+ KTEST_INFO(subscribers_changed_basic),
+ KTEST_INFO(subscribers_changed_null_safe),
+ KTEST_INFO(subscribers_changed_runs_unlocked),
+ KTEST_INFO(subscribers_changed_concurrent_subunsub),
+ KTEST_INFO(provider_config_null_equivalent),
+ KTEST_INFO(provider_config_default_enabled),
+};
+
+KTEST_MODULE_DECLARE(ktest_eventlog, tests);
+
diff --git a/sys/modules/ktest/Makefile b/sys/modules/ktest/Makefile
--- a/sys/modules/ktest/Makefile
+++ b/sys/modules/ktest/Makefile
@@ -1,4 +1,5 @@
SUBDIR= ktest \
+ ktest_eventlog \
ktest_example \
ktest_netlink_message_writer \
ktest_tcphpts
diff --git a/sys/modules/ktest/ktest_eventlog/Makefile b/sys/modules/ktest/ktest_eventlog/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/ktest/ktest_eventlog/Makefile
@@ -0,0 +1,15 @@
+PACKAGE= tests
+WARNS?= 6
+
+SYSDIR?=${SRCTOP}/sys
+.include "${SYSDIR}/conf/kern.opts.mk"
+
+.PATH: ${SYSDIR}/kern
+
+KMOD= ktest_eventlog
+SRCS= kern_eventlog_test.c
+
+EVENTLOG_SCHEMA= test_eventlog_schema.src
+
+.include <bsd.kmod.mk>
+
diff --git a/sys/sys/eventlog.h b/sys/sys/eventlog.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/eventlog.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#ifndef _SYS_EVENTLOG_H_
+#define _SYS_EVENTLOG_H_
+
+#include <sys/types.h>
+#include <sys/cdefs.h>
+
+/* Maximum provider name length */
+#define EVENTLOG_PROVIDER_NAME_MAX 32
+
+/*
+ * Keyword for session lifecycle events (reserved; provider schemas use
+ * KEYWORD SESSION 1).
+ */
+#define EVENTLOG_KEYWORD_SESSION 0x80000000
+
+/*
+ * Reserved event IDs (all providers). SESSION_CREATE / SESSION_END mark
+ * each session's lifetime; DUMP_COMPLETE is synthesised once per
+ * (subscriber, provider) at the end of an async dump_state replay,
+ * with session_id == EVENTLOG_SESSION_ID_NONE.
+ */
+#define EVENTLOG_SESSION_END_ID ((uint32_t)-1) /* UINT32_MAX */
+#define EVENTLOG_SESSION_CREATE_ID ((uint32_t)-2) /* UINT32_MAX - 1 */
+#define EVENTLOG_DUMP_COMPLETE_ID ((uint32_t)-3) /* UINT32_MAX - 2 */
+
+/* Sentinel session_id for framework events not tied to a session. */
+#define EVENTLOG_SESSION_ID_NONE ((uint64_t)-1) /* UINT64_MAX */
+
+/* Event log levels, from least (NONE = 0) to most verbose (TRACE = 5). */
+enum eventlog_level {
+ EVENTLOG_LEVEL_NONE,
+ EVENTLOG_LEVEL_ERROR,
+ EVENTLOG_LEVEL_WARN,
+ EVENTLOG_LEVEL_INFO,
+ EVENTLOG_LEVEL_VERBOSE,
+ EVENTLOG_LEVEL_TRACE
+};
+
+#ifdef _KERNEL
+
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <vm/uma.h>
+#include <sys/mutex.h>
+#include <machine/atomic.h>
+
+/* Event log provider structure */
+struct eventlog_provider;
+
+/* Session with exposed level/keywords for _ENABLED checks */
+#ifndef EVENTLOG_INTERNAL
+struct eventlog_session {
+ enum eventlog_level effective_level; /* Cached for _ENABLED macro */
+ uint32_t effective_keywords; /* Cached for _ENABLED macro */
+};
+#else
+struct eventlog_session; /* Full definition in kern_eventlog.c */
+#endif
+
+/*
+ * Optional callback invoked when a subscriber subscribes with
+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE. The provider should emit current
+ * state for all its sessions using the normal event write APIs; the
+ * framework routes those writes to the requesting subscriber only.
+ *
+ * Runs asynchronously on the eventlog dump taskqueue after the
+ * subscribe call has returned. The taskqueue is single-threaded, so
+ * concurrent invocations of the same callback never overlap. Callers
+ * that need to observe the post-dump state can call
+ * eventlog_subscriber_drain_dumps().
+ */
+typedef void (*eventlog_provider_dump_state_t)(
+ struct eventlog_provider *provider, void *arg);
+
+/*
+ * Optional callback invoked when the provider's default_enabled sysctl changes.
+ * value is the raw sysctl value: 0, 1, -1 (disable all then set 0),
+ * or 2 (enable all then set 1).
+ * When value is -1 or 2, the framework does NOT iterate sessions for this
+ * provider; the callback is responsible for enabling/disabling sessions itself.
+ * When value is 0 or 1, this is informational only (default changed).
+ */
+typedef void (*eventlog_default_changed_t)(
+ struct eventlog_provider *provider, int value, void *arg);
+
+/*
+ * Optional callback invoked when a provider transitions between "no
+ * subscribers" and "at least one subscriber". has_subscribers is the
+ * new state. Useful for gating expensive setup that only needs to run
+ * while a consumer is listening. May sleep and take other locks; must
+ * not re-enter the eventlog framework.
+ */
+typedef void (*eventlog_subscribers_changed_t)(
+ struct eventlog_provider *provider, bool has_subscribers, void *arg);
+
+/*
+ * Optional configuration for eventlog_provider_create. NULL or a
+ * zero-initialised struct yields no callbacks and disabled-by-default
+ * sessions. default_enabled seeds kern.eventlog.<name>.default; an
+ * explicit tunable still wins.
+ */
+struct eventlog_provider_config {
+ eventlog_provider_dump_state_t dump_callback;
+ void *dump_callback_arg;
+ eventlog_default_changed_t default_changed;
+ void *default_changed_arg;
+ eventlog_subscribers_changed_t subscribers_changed;
+ void *subscribers_changed_arg;
+ int default_enabled;
+};
+
+/*
+ * Create and register a new eventlog provider.
+ * config: Optional; NULL is equivalent to a zero-initialised config.
+ */
+struct eventlog_provider *eventlog_provider_create(const char *name,
+ const struct eventlog_provider_config *config);
+
+/*
+ * Unregister and destroy an eventlog provider.
+ */
+void eventlog_provider_destroy(struct eventlog_provider *provider);
+
+/*
+ * Query provider level and keywords (for testing/debugging).
+ */
+enum eventlog_level eventlog_provider_get_level(
+ struct eventlog_provider *provider);
+uint32_t eventlog_provider_get_keywords(struct eventlog_provider *provider);
+
+/*
+ * Query the provider's default_enabled setting (from
+ * kern.eventlog.<name>.default). Returns 0 (sessions start disabled) or 1
+ * (sessions start enabled).
+ */
+int eventlog_provider_get_default(struct eventlog_provider *provider);
+
+/*
+ * Set the provider's default_enabled value programmatically. This does NOT
+ * iterate existing sessions; only affects future session creates.
+ */
+void eventlog_provider_set_default(struct eventlog_provider *provider,
+ int value);
+
+/*
+ * Return the provider's auto-generated kern.eventlog.<name> sysctl node and
+ * its context list. Providers may attach children (e.g. kern.eventlog.cpu.hz);
+ * the framework owns the storage so children must not outlive the provider.
+ */
+struct sysctl_oid;
+struct sysctl_ctx_list;
+struct sysctl_oid *eventlog_provider_get_sysctl_node(
+ struct eventlog_provider *provider);
+struct sysctl_ctx_list *eventlog_provider_get_sysctl_ctx(
+ struct eventlog_provider *provider);
+
+/*
+ * Create a new eventlog session.
+ * session_id: Unique identifier (e.g., inp_gencnt for TCP per-connection
+ * sessions).
+ * waitok: If true, use M_WAITOK for allocations; else M_NOWAIT.
+ * create_payload: Optional provider-specific payload for SESSION_CREATE. If
+ * NULL, uses default (created_at only). Otherwise must match provider's
+ * SESSION_CREATE struct.
+ * create_payload_size: Size of create_payload, or 0 if NULL.
+ *
+ * The session's initial enabled state is derived from the provider's
+ * default_enabled sysctl (kern.eventlog.<name>.default). SESSION_CREATE is
+ * only emitted when enabled.
+ */
+struct eventlog_session *eventlog_session_create(
+ struct eventlog_provider *provider, uint64_t session_id, bool waitok,
+ void *create_payload, size_t create_payload_size);
+
+/*
+ * Destroy an eventlog session.
+ */
+void eventlog_session_destroy(struct eventlog_session *session);
+
+/*
+ * Enable or disable a session. When disabled, effective_level is set to
+ * EVENTLOG_LEVEL_NONE so the _ENABLED check fails. When enabled, effective
+ * values are restored from provider (or session override).
+ */
+void eventlog_session_set_enabled(struct eventlog_session *session,
+ int enabled);
+
+/*
+ * Returns non-zero if session is enabled, 0 if disabled or NULL.
+ */
+int eventlog_session_is_enabled(struct eventlog_session *session);
+
+/*
+ * Set a per-session level/keywords override that replaces the provider's
+ * values. Call eventlog_session_set_enabled(s, 1) afterwards to apply it.
+ * Level NONE or keywords 0 disables the session.
+ */
+void eventlog_session_set_filter(struct eventlog_session *session,
+ enum eventlog_level level, uint32_t keywords);
+
+/*
+ * Write an event directly to all relevant subscribers.
+ */
+void eventlog_event_write(struct eventlog_session *session, uint32_t id,
+ enum eventlog_level level, uint32_t keywords, void *buffer, size_t length);
+
+/*
+ * Same as eventlog_event_write but use a pre-computed timestamp (microseconds
+ * since boot). Use when the caller already queried time (e.g.
+ * session->created_at for SESSION_CREATE).
+ */
+void eventlog_event_write_at(struct eventlog_session *session, uint32_t id,
+ enum eventlog_level level, uint32_t keywords, void *buffer, size_t length,
+ uint64_t timestamp_us);
+
+/*
+ * Scatter/gather variants. The payload is the concatenation of iovcnt
+ * iovec entries (zero-length entries and iovcnt == 0 legal). Avoids an
+ * intermediate copy when the event has a variable-length tail.
+ */
+struct iovec;
+void eventlog_event_write_gather(struct eventlog_session *session, uint32_t id,
+ enum eventlog_level level, uint32_t keywords,
+ const struct iovec *iov, int iovcnt);
+void eventlog_event_write_gather_at(struct eventlog_session *session,
+ uint32_t id, enum eventlog_level level, uint32_t keywords,
+ const struct iovec *iov, int iovcnt, uint64_t timestamp_us);
+
+#endif /* _KERNEL */
+
+#endif /* _SYS_EVENTLOG_H_ */
diff --git a/sys/sys/eventlog_subscriber.h b/sys/sys/eventlog_subscriber.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/eventlog_subscriber.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#ifndef _SYS_EVENTLOG_SUBSCRIBER_H_
+#define _SYS_EVENTLOG_SUBSCRIBER_H_
+
+#include <sys/types.h>
+#include <sys/cdefs.h>
+#include <sys/eventlog.h>
+#include <sys/ioccom.h>
+
+/* Event header structure (naturally aligned, 32 bytes) */
+struct eventlog_event_header {
+ uint16_t event_length; /* Total size including this header */
+ uint16_t cpu; /* CPU ID */
+ uint16_t provider_id; /* Provider's unique ID */
+ uint16_t RESERVED; /* Must be written as zero; do not read */
+ uint64_t timestamp; /* Timestamp in microseconds */
+ uint64_t session_id; /* Session ID */
+ uint32_t event_id; /* Event ID (reserved IDs: see sys/eventlog.h) */
+ lwpid_t thread_id; /* Thread ID */
+};
+
+/* Subscriber type enum */
+enum eventlog_subscriber_type {
+ EVENTLOG_SUBSCRIBER_TYPE_DEVICE,
+ EVENTLOG_SUBSCRIBER_TYPE_CALLBACK
+};
+
+/*
+ * Per-subscription flags. Unknown bits are rejected with EINVAL by
+ * eventlog_subscriber_add_subscription() so new flags can be added
+ * later without silent breakage on old kernels.
+ *
+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE: opt in to a replay of the
+ * provider's current state. The framework enqueues an asynchronous
+ * dump task for each newly-subscribed provider with a dump_callback;
+ * events flow on the normal delivery path. See
+ * eventlog_subscriber_drain_dumps() to wait for completion.
+ */
+#define EVENTLOG_SUBSCRIPTION_DUMP_STATE 0x00000001
+#define EVENTLOG_SUBSCRIPTION_FLAGS_VALID \
+ (EVENTLOG_SUBSCRIPTION_DUMP_STATE)
+
+/* Subscription request structure (for ioctl) */
+struct eventlog_subscription_req {
+ enum eventlog_level level;
+ uint32_t keywords;
+ uint32_t flags;
+ char provider_name[EVENTLOG_PROVIDER_NAME_MAX];
+};
+
+/* Per-CPU buffer size limits (30-bit commit_pos in packed_state) */
+#define EVENTLOG_BUFFER_SIZE_MIN (64 * 1024) /* 64 KB */
+#define EVENTLOG_BUFFER_SIZE_MAX ((1 << 30) - 1) /* ~1 GB */
+
+/* CREATE request: creates subscriber and subscribes to providers */
+struct eventlog_create_req {
+ uint32_t buffer_size_per_cpu; /* Per-CPU buffer size in bytes */
+ uint32_t count; /* Number of entries in subscriptions[] */
+ /* Flexible array of count entries; see EVENTLOG_IOCTL_CREATE_SIZE(). */
+ struct eventlog_subscription_req subscriptions[];
+};
+
+/* Stats structure for GET_STATS IOCTL */
+struct eventlog_stats {
+ uint64_t dropped_events; /* Events dropped due to buffer full */
+};
+
+/*
+ * Provider info for GET_PROVIDERS IOCTL - returns subscribed providers with
+ * their ids.
+ */
+#define EVENTLOG_MAX_PROVIDERS 32
+struct eventlog_provider_info {
+ uint16_t provider_id;
+ char name[EVENTLOG_PROVIDER_NAME_MAX];
+} __packed;
+struct eventlog_get_providers_resp {
+ uint32_t count;
+ struct eventlog_provider_info providers[EVENTLOG_MAX_PROVIDERS];
+} __packed;
+
+/* IOCTL definitions */
+#define EVENTLOG_IOC_MAGIC 'E'
+#define EVENTLOG_IOCTL_CREATE_BASE \
+ _IOW(EVENTLOG_IOC_MAGIC, 1, struct eventlog_create_req)
+#define EVENTLOG_IOCTL_DESTROY _IO(EVENTLOG_IOC_MAGIC, 2)
+#define EVENTLOG_IOCTL_GET_STATS \
+ _IOR(EVENTLOG_IOC_MAGIC, 3, struct eventlog_stats)
+#define EVENTLOG_IOCTL_GET_PROVIDERS \
+ _IOR(EVENTLOG_IOC_MAGIC, 4, struct eventlog_get_providers_resp)
+
+#define EVENTLOG_IOCTL_CREATE_SIZE(count) \
+ _IOC_NEWLEN(EVENTLOG_IOCTL_CREATE_BASE, \
+ __builtin_offsetof(struct eventlog_create_req, subscriptions) + \
+ (count) * sizeof(struct eventlog_subscription_req))
+
+#ifdef _KERNEL
+
+#include <sys/conf.h>
+#include <sys/uio.h>
+
+/* Forward declarations */
+struct eventlog_subscriber;
+struct eventlog_subscription;
+
+/*
+ * Create a new device-based subscriber with per-CPU buffers.
+ * buffer_size_per_cpu: Size of buffer to allocate per CPU
+ * (EVENTLOG_BUFFER_SIZE_MIN to EVENTLOG_BUFFER_SIZE_MAX).
+ * The subscriber is automatically added to the global subscribers list.
+ * Returns NULL on failure, subscriber pointer on success.
+ */
+struct eventlog_subscriber *eventlog_subscriber_create_device(
+ uint32_t buffer_size_per_cpu);
+
+/*
+ * Callback function type for callback-based subscribers.
+ *
+ * The payload is delivered as a scatter/gather iovec; iovcnt == 1 for
+ * scalar writes and may be > 1 for variable-length events. Callbacks
+ * that need a flat payload must flatten the iov themselves. The iov and
+ * iov[*].iov_base pointers are only valid for the duration of the call.
+ *
+ * Parameters (in order):
+ * - hdr: Event header
+ * - provider_name: Provider name string
+ * - provider_name_len: Length of provider name (excluding NUL terminator)
+ * - session_id: Session ID (uint64_t, displayed as decimal)
+ * - iov, iovcnt: Payload segments. iovcnt == 0 means no payload.
+ * - payload_size: Sum of iov[*].iov_len (redundant; provided for convenience)
+ * - callback_arg: User-provided callback argument
+ */
+typedef void (*eventlog_callback_t)(const struct eventlog_event_header *hdr,
+ const char *provider_name, uint8_t provider_name_len, uint64_t session_id,
+ const struct iovec *iov, int iovcnt, size_t payload_size,
+ void *callback_arg);
+
+/*
+ * Create a new callback-based subscriber.
+ * callback: Function to call when events arrive.
+ * callback_arg: Argument to pass to callback function.
+ * The subscriber is automatically added to the global subscribers list.
+ * Returns NULL on failure, subscriber pointer on success.
+ */
+struct eventlog_subscriber *eventlog_subscriber_create_callback(
+ eventlog_callback_t callback, void *callback_arg);
+
+/*
+ * Destroy a subscriber and update provider enablement.
+ * Removes all subscriptions, drains any in-flight dump_state tasks,
+ * and frees resources.
+ */
+void eventlog_subscriber_destroy(struct eventlog_subscriber *subscriber);
+
+/*
+ * Add a subscription to a subscriber. flags is a bitmask of
+ * EVENTLOG_SUBSCRIPTION_* values; unknown bits return EINVAL. Pass 0
+ * for no flags. Returns 0 on success, error code on failure.
+ */
+int eventlog_subscriber_add_subscription(struct eventlog_subscriber *subscriber,
+ const char *provider_name, enum eventlog_level level, uint32_t keywords,
+ uint32_t flags);
+
+/*
+ * Wait for every dump_state task this subscriber has outstanding
+ * (queued or running) to finish. Safe to call from any sleepable
+ * context.
+ */
+void eventlog_subscriber_drain_dumps(struct eventlog_subscriber *subscriber);
+
+/*
+ * Read events from a device subscriber's buffer.
+ * Handles both user-space (UIO_USERSPACE) and kernel (UIO_SYSSPACE) uio.
+ *
+ * Parameters:
+ * - subscriber: The subscriber to read from
+ * - uio: Scatter/gather I/O structure (must have uio_td set for user space)
+ * - flags: Read flags (e.g. FNONBLOCK for non-blocking)
+ *
+ * Returns 0 on success, or an error code on failure.
+ */
+int eventlog_subscriber_read(struct eventlog_subscriber *subscriber,
+ struct uio *uio, int flags);
+
+/*
+ * Query subscriber statistics.
+ * Fills stats with current values (e.g. dropped_events).
+ */
+void eventlog_subscriber_get_stats(struct eventlog_subscriber *subscriber,
+ struct eventlog_stats *stats);
+
+#endif /* _KERNEL */
+
+#endif /* _SYS_EVENTLOG_SUBSCRIBER_H_ */
diff --git a/targets/pseudo/userland/Makefile.depend b/targets/pseudo/userland/Makefile.depend
--- a/targets/pseudo/userland/Makefile.depend
+++ b/targets/pseudo/userland/Makefile.depend
@@ -16,6 +16,7 @@
bin/domainname \
bin/echo \
bin/ed \
+ bin/elog \
bin/expr \
bin/freebsd-version \
bin/getfacl \
diff --git a/tests/sys/kern/Makefile b/tests/sys/kern/Makefile
--- a/tests/sys/kern/Makefile
+++ b/tests/sys/kern/Makefile
@@ -146,6 +146,9 @@
CFLAGS.subr_unit.c+= -Wno-missing-prototypes
SRCS.subr_unit_test+= subr_unit.c
+ATF_TESTS_PYTEST+= kern_eventlog_test.py
+ATF_TESTS_PYTEST+= elog_test.py
+
WARNS?= 3
TESTS_SUBDIRS+= acct
diff --git a/tests/sys/kern/elog_test.py b/tests/sys/kern/elog_test.py
new file mode 100644
--- /dev/null
+++ b/tests/sys/kern/elog_test.py
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2026 Netflix, Inc.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+"""ATF tests for the elog(1) userspace utility.
+
+Smoke tests for the elog binary CLI surface. These catch packaging
+regressions (missing binary, broken option parser, broken capture-file
+reader) without requiring /dev/eventlog or any provider to be present.
+
+End-to-end coverage of the device interface and individual providers
+lives with the providers themselves; the framework's own kernel-side
+tests are in kern_eventlog_test.py.
+"""
+
+import subprocess
+from pathlib import Path
+
+import pytest
+from atf_python.utils import BaseTest
+
+ELOG = "/usr/bin/elog"
+
+
+class TestElogCli(BaseTest):
+ @pytest.mark.require_progs(["elog"])
+ def test_help(self):
+ # usage() in elog.c calls exit(1), so -h is expected to fail.
+ for flag in ("-h", "--help"):
+ r = subprocess.run(
+ [ELOG, flag], capture_output=True, text=True)
+ assert r.returncode == 1, f"elog {flag} returncode"
+ assert "usage: elog" in r.stderr, f"elog {flag} stderr"
+
+ @pytest.mark.require_progs(["elog"])
+ def test_no_args(self):
+ r = subprocess.run([ELOG], capture_output=True, text=True)
+ assert r.returncode == 1
+ assert "no subscriptions specified" in r.stderr
+
+ @pytest.mark.require_progs(["elog"])
+ def test_unknown_arg(self):
+ r = subprocess.run(
+ [ELOG, "--not-a-real-flag"], capture_output=True, text=True)
+ assert r.returncode == 1
+ assert "unknown argument" in r.stderr
+
+ @pytest.mark.require_progs(["elog"])
+ def test_read_missing_file(self, tmp_path):
+ target = tmp_path / "does-not-exist.elog"
+ r = subprocess.run(
+ [ELOG, "-r", str(target)], capture_output=True, text=True)
+ assert r.returncode == 1
+ assert "fopen" in r.stderr
+
+ @pytest.mark.require_progs(["elog"])
+ def test_read_invalid_magic(self, tmp_path):
+ # 64 zero bytes is enough to satisfy the initial header read but
+ # fails the ELOG_BINARY_MAGIC check.
+ bogus = tmp_path / "bogus.elog"
+ bogus.write_bytes(b"\0" * 64)
+ r = subprocess.run(
+ [ELOG, "-r", str(bogus)], capture_output=True, text=True)
+ assert r.returncode == 1
+ assert "bad magic number" in r.stderr
diff --git a/tests/sys/kern/kern_eventlog_test.py b/tests/sys/kern/kern_eventlog_test.py
new file mode 100644
--- /dev/null
+++ b/tests/sys/kern/kern_eventlog_test.py
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2026 Netflix, Inc.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+from atf_python.ktest import BaseKernelTest
+
+class TestKernEventlog(BaseKernelTest):
+ KTEST_MODULE_NAME = "ktest_eventlog"
diff --git a/usr.bin/Makefile b/usr.bin/Makefile
--- a/usr.bin/Makefile
+++ b/usr.bin/Makefile
@@ -35,6 +35,7 @@
du \
elfctl \
elfdump \
+ elog \
enigma \
env \
etdump \
diff --git a/usr.bin/elog/Makefile b/usr.bin/elog/Makefile
new file mode 100644
--- /dev/null
+++ b/usr.bin/elog/Makefile
@@ -0,0 +1,63 @@
+.include <src.opts.mk>
+
+PACKAGE=runtime
+PROG= elog
+MAN= elog.1
+LIBADD= z
+
+# Schema directory location
+EVENTLOG_SCHEMA_DIR= ${SRCTOP}/include/eventlog
+
+# Find all schema files (handle case where directory doesn't exist yet)
+EVENTLOG_SCHEMAS!= if [ -d ${EVENTLOG_SCHEMA_DIR} ]; then find ${EVENTLOG_SCHEMA_DIR} -name '*_eventlog_schema.src' 2>/dev/null | sed 's|.*/||' | sort; fi
+
+# Output directory for generated consumer headers
+# Use OBJTOP directly and append sys/include/eventlog
+# Ensure OBJTOP is treated as absolute path (it should be, but be explicit)
+EVENTLOG_HEADER_DIR= ${OBJTOP:tA}/sys/include/eventlog
+EVENTLOG_CONSUMER_HEADER= eventlog_consumer.h
+
+# Generate consumer headers for each schema and master header
+.if !empty(EVENTLOG_SCHEMAS)
+EVENTLOG_CONSUMER_HEADERS_GEN= ${EVENTLOG_HEADER_DIR}/.consumer_headers_generated
+CLEANFILES+= ${EVENTLOG_CONSUMER_HEADERS_GEN}
+# Generate headers immediately when Makefile is parsed (for early availability)
+.if !make(clean) && !make(cleandir) && !make(clobber)
+_GEN_CONSUMER_HEADERS!= ${.CURDIR}/gen_eventlog_headers.sh \
+ ${EVENTLOG_SCHEMA_DIR} \
+ ${OBJTOP:tA}/sys/include/eventlog \
+ ${SRCTOP} \
+ ${SRCTOP}/include/eventlog/eventlog_gen.awk \
+ ${OBJTOP:tA}/sys/include/eventlog/${EVENTLOG_CONSUMER_HEADER} || true; \
+ echo "consumer_headers_generated"
+.endif
+# Create Make targets as a safety net (immediate generation above should be sufficient)
+EVENTLOG_SCHEMA_DEPS!= if [ -d ${EVENTLOG_SCHEMA_DIR} ]; then find ${EVENTLOG_SCHEMA_DIR} -name '*_eventlog_schema.src' -type f 2>/dev/null | sort; fi
+${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}: ${SRCTOP}/include/eventlog/eventlog_gen.awk ${.CURDIR}/gen_eventlog_headers.sh ${EVENTLOG_SCHEMA_DEPS}
+ ${.CURDIR}/gen_eventlog_headers.sh \
+ ${EVENTLOG_SCHEMA_DIR} \
+ ${EVENTLOG_HEADER_DIR} \
+ ${SRCTOP} \
+ ${SRCTOP}/include/eventlog/eventlog_gen.awk \
+ ${.TARGET}
+ @touch ${EVENTLOG_CONSUMER_HEADERS_GEN}
+
+CLEANFILES+= ${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}
+clean-consumer-headers:
+ @if [ -d ${EVENTLOG_HEADER_DIR} ]; then \
+ for schema in ${EVENTLOG_SCHEMAS}; do \
+ provider=$$(awk '/^PROVIDER/ {print tolower($$2); exit}' ${EVENTLOG_SCHEMA_DIR}/$$schema 2>/dev/null || true); \
+ if [ -n "$$provider" ]; then \
+ rm -f ${EVENTLOG_HEADER_DIR}/$${provider}_eventlog_consumer.h; \
+ fi; \
+ done; \
+ fi
+
+CFLAGS+= -I${EVENTLOG_HEADER_DIR} -I${SRCTOP}/sys
+
+beforebuild: ${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}
+elog.o: ${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}
+.endif
+
+.include <bsd.prog.mk>
+
diff --git a/usr.bin/elog/elog.1 b/usr.bin/elog/elog.1
new file mode 100644
--- /dev/null
+++ b/usr.bin/elog/elog.1
@@ -0,0 +1,293 @@
+.\"
+.\" Copyright (c) 2026 Netflix, Inc.
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.Dd April 27, 2026
+.Dt ELOG 1
+.Os
+.Sh NAME
+.Nm elog
+.Nd subscribe to and read events from the eventlog device
+.Sh SYNOPSIS
+.Nm
+.Op Fl b Ar size
+.Op Fl d
+.Op Fl D
+.Op Fl e
+.Op Fl f Ar type
+.Op Fl n
+.Op Fl p
+.Op Fl s
+.Op Fl t
+.Op Fl -delta-time
+.Op Fl -duration Ar seconds
+.Op Fl o Ar file | Cm dir= Ns Ar path
+.Op Fl c Ar provider Op Ar level Op Ar keywords
+.Op Fl c Ar provider Op Ar level Op Ar keywords
+.Op ...
+.Op Fl h
+.Nm
+.Fl r Ar file
+.Op Fl d
+.Op Fl e
+.Op Fl n
+.Op Fl p
+.Op Fl t
+.Op Fl -delta-time
+.Sh DESCRIPTION
+The
+.Nm
+utility subscribes to events from one or more eventlog providers and displays
+them as formatted text on standard output or writes them to a binary file.
+.Pp
+The utility opens the single system-wide eventlog device at
+.Pa /dev/eventlog
+and sends subscription requests for the specified providers.
+.Pp
+The eventlog framework is host-global and is not exposed to jailed
+processes.
+The
+.Nm
+utility must be run from the host
+.Pq Va prison0 ;
+running it from inside a jail fails because
+.Xr open 2
+on
+.Pa /dev/eventlog
+returns
+.Er EPERM .
+.Pp
+The options are as follows:
+.Bl -tag -width indent
+.It Fl c Ar provider Op Ar level Op Ar keywords
+.It Fl -capture Ar provider Op Ar level Op Ar keywords
+Subscribe to events from the specified provider.
+.Bl -tag -width indent
+.It Ar provider
+The name of the eventlog provider.
+This argument is required.
+.It Ar level
+Optional log level.
+Can be specified as a case-insensitive string
+(NONE, ERROR, WARN, INFO, VERBOSE, TRACE)
+or as a numeric value (0\(en5).
+Defaults to VERBOSE if not specified.
+TRACE captures all events including internal algorithmic details.
+VERBOSE excludes TRACE-level events.
+.It Ar keywords
+Optional keyword filter.
+Can be specified as a hexadecimal mask (e.g., 0x3F) or as pipe-delimited
+keyword names (e.g., CC|RX|TX).
+Keyword names are provider-specific and case-insensitive.
+Defaults to 0xFFFFFFFF (all keywords) if not specified.
+.El
+.It Fl b Ar size
+.It Fl -buffer-size Ar size
+Set per-CPU buffer size.
+The size can be specified as bytes, or with K/M/G suffix for Kilobytes,
+Megabytes, or Gigabytes.
+Examples: 65536, 64K, 1M, 256K.
+Valid range: 64KB to 1GB per CPU.
+Default: 512KB.
+.It Fl d
+.It Fl -date
+Show the full date in timestamps.
+When enabled, timestamps are displayed in the format
+.Li YYYY-MM-DD HH:MM:SS.uuuuuu
+instead of the default
+.Li HH:MM:SS.uuuuuu .
+.It Fl D
+.It Fl -dump-state
+Request that providers dump their current state when subscribing.
+This causes providers to emit initial state events (e.g., connection parameters,
+current congestion window) asynchronously, shortly after subscription.
+.It Fl e
+.It Fl -event-name
+Show the event name (e.g., IN, OUT, SESSION_CREATE) in square brackets
+after the session ID in each output line.
+.It Fl n
+.It Fl -event-number
+Print a serial event number at the beginning of each output line.
+Events are numbered starting from 1.
+.It Fl o Ar file
+.It Fl -output Ar file
+Write output to the specified file in binary format.
+See
+.Xr elog 5
+for a description of the binary file format.
+Uses buffered I/O for efficiency.
+Standard output always uses formatted text.
+.It Fl o Cm dir= Ns Ar path
+Write one binary file per session under
+.Ar path .
+Each session gets its own file, named by session ID.
+The directory is created if it does not exist.
+.It Fl p
+.It Fl -providers
+Print all registered provider names to stderr at the beginning of output,
+before any events are displayed.
+.It Fl r Ar file
+.It Fl -read-binary Ar file
+Read a binary file created with
+.Fl o
+and convert it to formatted text output.
+See
+.Xr elog 5
+for a description of the binary file format.
+The binary file header includes total event count and dropped-event count.
+If the filename ends in
+.Pa .gz ,
+the file is transparently decompressed using zlib.
+This mode cannot be used with capture options.
+.It Fl s
+.It Fl -stats
+Print detailed statistics on exit, including provider count, the number
+of events received, and any dropped events.
+Statistics are printed to stderr.
+Without this flag, no statistics are printed.
+.It Fl t
+.It Fl -relative-time
+Show time relative to the first event in the trace.
+Each output line is prefixed with a relative timestamp in the format
+.Li +seconds.microseconds
+(e.g.,
+.Li +1.234567 ) .
+.It Fl -delta-time
+Show time elapsed since the previous event.
+Each output line is prefixed with a delta timestamp in the format
+.Li d Ns seconds.microseconds
+(e.g.,
+.Li d0.000015 ) .
+Can be combined with
+.Fl t .
+.It Fl -duration Ar seconds
+Self-exit after
+.Ar seconds
+seconds.
+.Nm
+raises
+.Dv SIGALRM
+internally and takes the same cleanup path as
+.Dv SIGINT
+or
+.Dv SIGTERM :
+binary output is flushed, the file header is updated with final
+event/drop counts, and per-session files are closed and renamed.
+A value of 0 disables the timer (the default), in which case
+.Nm
+runs until a signal is received.
+Intended for scripted captures
+.Pq Xr oca.py 1 Cm get Cm capture , cron jobs, ...
+that want a fixed recording window without relying on an external
+.Xr kill 1 .
+.It Fl h
+.It Fl -help
+Display usage information and exit.
+.El
+.Pp
+Multiple provider subscriptions can be specified by using multiple
+.Fl c
+flags.
+.Pp
+The utility reads events continuously, blocking when no data is available.
+Events are displayed with timestamp, CPU, thread ID, provider name, session ID,
+and formatted event data.
+When
+.Fl n , Fl t ,
+or
+.Fl -delta-time
+are enabled, their respective prefixes appear at the beginning of each line
+before the CPU and thread fields.
+.Pp
+Statistics are only printed when the
+.Fl s
+flag is specified.
+When enabled, statistics include the number of providers, events received,
+and any dropped events.
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+Subscribe to all events from a provider at VERBOSE level with all keywords:
+.Bd -literal -offset indent
+elog -c provider
+.Ed
+.Pp
+Subscribe to a provider at INFO level with all keywords:
+.Bd -literal -offset indent
+elog -c provider INFO
+.Ed
+.Pp
+Subscribe to a provider at INFO level with specific keywords by name:
+.Bd -literal -offset indent
+elog -c provider INFO KEYWORD1|KEYWORD2
+.Ed
+.Pp
+Subscribe to a provider at INFO level with keywords as hex mask:
+.Bd -literal -offset indent
+elog -c provider INFO 0x3F
+.Ed
+.Pp
+Subscribe to a provider at TRACE level:
+.Bd -literal -offset indent
+elog -c provider TRACE
+.Ed
+.Pp
+Subscribe to multiple providers:
+.Bd -literal -offset indent
+elog -c provider1 INFO -c provider2 WARN
+.Ed
+.Pp
+Subscribe using numeric levels:
+.Bd -literal -offset indent
+elog -c provider 3 0x3F
+.Ed
+.Pp
+Set buffer size to 4MB per CPU:
+.Bd -literal -offset indent
+elog -b 4M -c provider
+.Ed
+.Pp
+Write output to a file (binary format):
+.Bd -literal -offset indent
+elog -c provider -o /tmp/events.bin
+.Ed
+.Pp
+Read binary file and convert to text:
+.Bd -literal -offset indent
+elog -r /tmp/events.bin
+.Ed
+.Pp
+Show relative timestamps from trace start:
+.Bd -literal -offset indent
+elog -t -c provider
+.Ed
+.Pp
+Show both relative and inter-event delta timestamps:
+.Bd -literal -offset indent
+elog -t --delta-time -c provider
+.Ed
+.Pp
+Write per-session files into a directory:
+.Bd -literal -offset indent
+elog -o dir=/tmp/traces -c provider
+.Ed
+.Pp
+Print statistics on exit:
+.Bd -literal -offset indent
+elog -s -c provider
+.Ed
+.Pp
+Run a fixed-duration capture, suitable for scripted use:
+.Bd -literal -offset indent
+elog --duration 30 -c provider1 -c provider2 -o /tmp/cap.elog
+.Ed
+.Sh SEE ALSO
+.Xr elog 5 ,
+.Xr eventlog 9
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx 16.0 .
+
diff --git a/usr.bin/elog/elog.c b/usr.bin/elog/elog.c
new file mode 100644
--- /dev/null
+++ b/usr.bin/elog/elog.c
@@ -0,0 +1,1278 @@
+/*
+ * Copyright (c) 2026 Netflix, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/eventlog.h>
+#include <sys/eventlog_subscriber.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/ioccom.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <err.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <zlib.h>
+
+/* Include consumer header for formatting events */
+/* Generated headers are in the build directory */
+#include "eventlog_consumer.h"
+
+/*
+ * One capture request collected from a -c/--capture argument by
+ * parse_arguments(); run_eventlog_mode() copies each entry into the
+ * CREATE ioctl request.
+ */
+struct subscription {
+ /* Provider to subscribe to (NUL-terminated). */
+ char provider_name[EVENTLOG_PROVIDER_NAME_MAX];
+ /* Requested level; parse_arguments() defaults it to VERBOSE. */
+ enum eventlog_level level;
+ /* Keyword mask; parse_arguments() always ORs in the SESSION keyword. */
+ uint32_t keywords;
+};
+
+static struct subscription *subscriptions = NULL;
+static int subscription_count = 0;
+static int subscription_capacity = 0;
+static uint32_t buffer_size_per_cpu = 512 * 1024; /* Default 512K */
+/* When > 0, exit cleanly via SIGALRM after this many seconds. */
+static unsigned int duration_sec = 0;
+static volatile bool done = false;
+static int eventlog_fd = -1; /* eventlog device fd (for stats) */
+static volatile bool stats_printed = false;
+static uint64_t events_received = 0;
+static bool verbose_stats = false; /* Print detailed stats on exit */
+static const char *binary_input_file = NULL;
+static uint64_t last_dropped_events = 0; /* From last GET_STATS */
+
+/* For read mode: base timestamps from file header for UTC calculation */
+static uint64_t read_capture_start = 0;
+static uint64_t read_start_utc_us = 0;
+static bool show_date = false; /* Full date in timestamps */
+static bool show_event_number = false; /* Print serial number per line */
+static bool show_providers = false; /* Print provider names at start */
+static bool show_event_name = false; /* Print event name after sid */
+static bool show_relative_time = false; /* Time relative to first event */
+static bool show_delta_time = false; /* Time since previous event */
+static uint64_t first_event_ts = 0;
+static uint64_t prev_event_ts = 0;
+static bool dump_state = false; /* Replay current state on subscribe */
+static char *output_dir = NULL; /* If set, one file per session */
+
+/*
+ * Per-session file state for -o dir= mode.
+ *
+ * The same structure also backs single-file output (single_output below),
+ * in which case only fp, header_written, capture_start and start_utc_us
+ * are used.
+ */
+struct session_file {
+ STAILQ_ENTRY(session_file) link;
+ /* Session id rendered as a string; lookup key in session_files. */
+ char *session_id;
+ /* Path the file was opened with. */
+ char *filepath;
+ FILE *fp;
+ /* True once the binary header has been written to fp. */
+ bool header_written;
+ /* Timestamp of the first event written (recorded in the header). */
+ uint64_t capture_start;
+ /* Wall-clock time, in microseconds, when the header was written. */
+ uint64_t start_utc_us;
+ /* Number of events written to this file so far. */
+ uint64_t event_count;
+};
+/* Open per-session files, in creation order. */
+static STAILQ_HEAD(, session_file) session_files =
+ STAILQ_HEAD_INITIALIZER(session_files);
+/* Binary output state for single-file mode. */
+static struct session_file single_output;
+
+/*
+ * Report whether events are being written in binary form, i.e. either a
+ * per-session output directory or a single output file was configured.
+ */
+static inline bool
+binary_output_mode(void)
+{
+	if (output_dir != NULL)
+		return (true);
+	return (single_output.fp != NULL);
+}
+
+/* Provider id->name map (from GET_PROVIDERS or file header) */
+static struct eventlog_provider_info provider_map[EVENTLOG_MAX_PROVIDERS];
+static uint32_t provider_map_count = 0;
+
+/*
+ * When provider listing was requested and the provider map is non-empty,
+ * print the known provider names on a single stderr line.
+ */
+static void
+print_provider_names(void)
+{
+	uint32_t idx;
+
+	if (show_providers && provider_map_count != 0) {
+		fprintf(stderr, "[Providers] %u registered:",
+		    provider_map_count);
+		idx = 0;
+		while (idx < provider_map_count) {
+			fprintf(stderr, " %s", provider_map[idx].name);
+			idx++;
+		}
+		fprintf(stderr, "\n");
+	}
+}
+
+/*
+ * Translate a provider id into its name using provider_map.
+ * Returns "?" when the id is not present in the map.
+ */
+static const char *
+get_provider_name(uint16_t id)
+{
+	const struct eventlog_provider_info *entry = provider_map;
+	const struct eventlog_provider_info *limit =
+	    provider_map + provider_map_count;
+
+	for (; entry < limit; entry++) {
+		if (entry->provider_id == id)
+			return (entry->name);
+	}
+	return ("?");
+}
+
+/*
+ * Binary file format structures.
+ *
+ * As written by write_binary_header_to_file(), a file starts with this
+ * header, followed by a uint32_t provider count and that many
+ * struct eventlog_provider_info records; event records follow.
+ * All fields are written in host representation via fwrite().
+ */
+#define ELOG_BINARY_MAGIC "ELOG"
+#define ELOG_BINARY_VERSION 1
+
+struct elog_binary_header {
+ char magic[4]; /* "ELOG" */
+ uint32_t version; /* File format version */
+ uint64_t capture_start; /* us since boot at capture start */
+ uint64_t start_utc_us; /* UTC us at capture start */
+ uint64_t event_count; /* Total events in file */
+ uint64_t dropped_events; /* Drop count; rewritten when capture ends */
+} __packed;
+
+/*
+ * Print the command-line synopsis to stderr and exit with status 1.
+ * The buffer-size limits shown are derived from EVENTLOG_BUFFER_SIZE_MIN
+ * (in KB) and EVENTLOG_BUFFER_SIZE_MAX (in MB).  Does not return.
+ */
+static void
+usage(void)
+{
+ fprintf(stderr,
+"usage: elog [options]\n"
+" -c, --capture <provider> [level] [keywords]\n"
+" Capture events from provider\n"
+" provider: Provider name\n"
+" level: NONE/0, ERROR/1, WARN/2, INFO/3,\n"
+" VERBOSE/4, TRACE/5 (default: VERBOSE)\n"
+" keywords: Hex (0x3F) or names (CC|RX|TX)\n"
+" (default: 0xFFFFFFFF, all flags)\n"
+" -b, --buffer-size <size> Set per-CPU buffer size (default: 512K)\n"
+" Size in bytes or with K/M/G suffix\n"
+" Valid range: %uKB to %uMB per CPU\n"
+" --duration <sec> Self-exit after <sec> seconds (SIGALRM).\n"
+" Same cleanup as SIGINT/SIGTERM. 0 = no timeout.\n"
+" -d, --date Show full date (YYYY-MM-DD) in timestamps\n"
+" -e, --event-name Show event name after session ID\n"
+" -n, --event-number Print event serial number per line\n"
+" -p, --providers Print provider names at start of output\n"
+" -s, --stats Print detailed statistics on exit\n"
+" -o, --output <file> Write binary output to file (default: stdout)\n"
+" -o dir=<path> Write one binary file per session under <path>\n"
+" -r, --read-binary <file>\n"
+" Read binary file and convert to text (.gz ok)\n"
+" -t, --relative-time Show time relative to first event\n"
+" --delta-time Show time since previous event\n"
+" -D, --dump-state Request providers to replay current state\n"
+"\n"
+" Multiple captures can be specified:\n"
+" elog -c provider\n"
+" elog -c provider INFO\n"
+" elog -c provider INFO 0x3F\n"
+" elog -c provider1 -c provider2 WARN\n"
+" elog -c provider -o /tmp/events.bin\n"
+" elog -r /tmp/events.bin\n",
+ EVENTLOG_BUFFER_SIZE_MIN / 1024,
+ EVENTLOG_BUFFER_SIZE_MAX / (1024 * 1024));
+ exit(1);
+}
+
+/*
+ * Try to interpret str as an event level, given either by name
+ * (case-insensitive) or as a decimal number in the range
+ * [EVENTLOG_LEVEL_NONE, EVENTLOG_LEVEL_TRACE].
+ *
+ * Returns true and stores the level in *out on success.  Returns false
+ * and leaves *out untouched when str is not a level, so the caller can
+ * try to interpret the argument as something else.
+ */
+static bool
+try_parse_level(const char *str, enum eventlog_level *out)
+{
+	static const struct {
+		const char *name;
+		enum eventlog_level level;
+	} levels[] = {
+		{ "NONE", EVENTLOG_LEVEL_NONE },
+		{ "ERROR", EVENTLOG_LEVEL_ERROR },
+		{ "WARN", EVENTLOG_LEVEL_WARN },
+		{ "INFO", EVENTLOG_LEVEL_INFO },
+		{ "VERBOSE", EVENTLOG_LEVEL_VERBOSE },
+		{ "TRACE", EVENTLOG_LEVEL_TRACE },
+	};
+	char *endptr;
+	long num;
+
+	for (size_t i = 0; i < nitems(levels); i++) {
+		if (strcasecmp(str, levels[i].name) == 0) {
+			*out = levels[i].level;
+			return (true);
+		}
+	}
+
+	/*
+	 * strtol() consumes nothing for an empty (or all-blank) string and
+	 * leaves endptr == str; without this check "" would be accepted as
+	 * level 0.
+	 */
+	num = strtol(str, &endptr, 10);
+	if (endptr == str || *endptr != '\0')
+		return (false);
+	if (num < EVENTLOG_LEVEL_NONE || num > EVENTLOG_LEVEL_TRACE)
+		return (false);
+
+	*out = (enum eventlog_level)num;
+	return (true);
+}
+
+/*
+ * Try to interpret str as a keyword mask for the given provider.
+ *
+ * Two forms are accepted:
+ *  - a hexadecimal mask with a 0x/0X prefix that fits in 32 bits;
+ *  - one or more keyword names separated by '|', each resolved through
+ *    eventlog_keyword_from_string().
+ *
+ * Returns true and stores the mask in *out on success.  Returns false and
+ * leaves *out untouched when str is not a valid keyword specification.
+ */
+static bool
+try_parse_keywords(const char *provider, const char *str, uint32_t *out)
+{
+	char *copy, *token, *saveptr, *endptr;
+	unsigned long mask;
+	uint32_t result, kw;
+
+	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) {
+		/*
+		 * Require the whole argument to be hex digits: a bare "0x"
+		 * or "0x3Fjunk" leaves endptr short of the terminator.
+		 */
+		errno = 0;
+		mask = strtoul(str, &endptr, 16);
+		if (*endptr != '\0' || errno == ERANGE || mask > UINT32_MAX)
+			return (false);
+		*out = (uint32_t)mask;
+		return (true);
+	}
+
+	/* strtok_r() modifies its input, so tokenize a private copy. */
+	copy = strdup(str);
+	if (copy == NULL)
+		return (false);
+
+	result = 0;
+	for (token = strtok_r(copy, "|", &saveptr); token != NULL;
+	    token = strtok_r(NULL, "|", &saveptr)) {
+		kw = eventlog_keyword_from_string(provider, token);
+		if (kw == 0) {
+			/* A zero result is treated as an unknown name. */
+			free(copy);
+			return (false);
+		}
+		result |= kw;
+	}
+	free(copy);
+
+	/* No tokens at all (e.g. "" or "|"). */
+	if (result == 0)
+		return (false);
+
+	*out = result;
+	return (true);
+}
+
+/*
+ * Parse a buffer size such as "524288", "512K", "4M" or "1G".
+ *
+ * The number is parsed with base auto-detection (decimal, 0x hex, 0 octal)
+ * and may be followed by a single K/M/G unit (either case), optionally
+ * surrounded by blanks.  The result must lie within
+ * [EVENTLOG_BUFFER_SIZE_MIN, EVENTLOG_BUFFER_SIZE_MAX].
+ *
+ * Any error is fatal: the function reports it with errx() and does not
+ * return.
+ */
+static size_t
+parse_size(const char *size_str)
+{
+	char *endptr;
+	unsigned long long size, multiplier;
+	char unit;
+
+	/*
+	 * strtoull() accepts a leading '-' and returns the negated value,
+	 * which could wrap into the valid range; reject it up front.
+	 */
+	if (size_str[strspn(size_str, " \t\n\v\f\r")] == '-')
+		errx(1, "invalid buffer size: %s", size_str);
+
+	errno = 0;
+	size = strtoull(size_str, &endptr, 0);
+	if (endptr == size_str)
+		errx(1, "invalid buffer size: %s", size_str);
+	if (errno == ERANGE)
+		errx(1, "buffer size too large: maximum is %u bytes",
+		    EVENTLOG_BUFFER_SIZE_MAX);
+
+	/* Skip whitespace */
+	while (*endptr == ' ' || *endptr == '\t')
+		endptr++;
+
+	/* Check for unit suffix */
+	multiplier = 1;
+	unit = *endptr;
+	if (unit != '\0') {
+		endptr++;	/* Skip the unit character */
+		/* Check for any remaining characters */
+		while (*endptr == ' ' || *endptr == '\t')
+			endptr++;
+		if (*endptr != '\0')
+			errx(1,
+			    "invalid buffer size: trailing characters after unit");
+
+		switch (unit) {
+		case 'K':
+		case 'k':
+			multiplier = 1024ULL;
+			break;
+		case 'M':
+		case 'm':
+			multiplier = 1024ULL * 1024;
+			break;
+		case 'G':
+		case 'g':
+			multiplier = 1024ULL * 1024 * 1024;
+			break;
+		default:
+			errx(1, "invalid buffer size unit: %c (use K, M, or G)",
+			    unit);
+		}
+	}
+
+	/*
+	 * Check before multiplying: an unsigned wrap-around could otherwise
+	 * turn an absurdly large request into an in-range value.
+	 */
+	if (size > ULLONG_MAX / multiplier)
+		errx(1, "buffer size too large: maximum is %u bytes",
+		    EVENTLOG_BUFFER_SIZE_MAX);
+	size *= multiplier;
+
+	if (size < EVENTLOG_BUFFER_SIZE_MIN)
+		errx(1, "buffer size too small: minimum is %u bytes",
+		    EVENTLOG_BUFFER_SIZE_MIN);
+	if (size > EVENTLOG_BUFFER_SIZE_MAX)
+		errx(1, "buffer size too large: maximum is %u bytes",
+		    EVENTLOG_BUFFER_SIZE_MAX);
+
+	return ((size_t)size);
+}
+
+/*
+ * Render the event timestamp 'us' (microseconds) into buf.
+ *
+ * When base_utc_us is non-zero, the timestamp is converted to UTC as
+ * base_utc_us + (us - base_ts) and printed as HH:MM:SS.uuuuuu, preceded by
+ * YYYY-MM-DD when show_date is set.  When base_utc_us is zero, or the
+ * gmtime() conversion fails, 'us' is printed as an uptime-style
+ * HH:MM:SS.uuuuuu value.
+ */
+static void
+format_timestamp(uint64_t us, char *buf, size_t bufsize,
+    uint64_t base_ts, uint64_t base_utc_us)
+{
+	uint64_t total_sec;
+
+	if (base_utc_us != 0) {
+		int64_t offset = (int64_t)us - (int64_t)base_ts;
+		uint64_t wall_us = (uint64_t)((int64_t)base_utc_us + offset);
+		time_t wall_sec = (time_t)(wall_us / 1000000);
+		unsigned long frac = (unsigned long)(wall_us % 1000000);
+		struct tm *utc = gmtime(&wall_sec);
+
+		if (utc != NULL && show_date) {
+			snprintf(buf, bufsize,
+			    "%04d-%02d-%02d %02d:%02d:%02d.%06lu",
+			    utc->tm_year + 1900, utc->tm_mon + 1,
+			    utc->tm_mday, utc->tm_hour, utc->tm_min,
+			    utc->tm_sec, frac);
+			return;
+		}
+		if (utc != NULL) {
+			snprintf(buf, bufsize, "%02d:%02d:%02d.%06lu",
+			    utc->tm_hour, utc->tm_min, utc->tm_sec, frac);
+			return;
+		}
+	}
+
+	/* Fallback: uptime format */
+	total_sec = us / 1000000;
+	snprintf(buf, bufsize, "%02llu:%02llu:%02llu.%06llu",
+	    (unsigned long long)(total_sec / 3600),
+	    (unsigned long long)((total_sec % 3600) / 60),
+	    (unsigned long long)(total_sec % 60),
+	    (unsigned long long)(us % 1000000));
+}
+
+/* Forward declarations */
+static void write_binary_header_to_file(FILE *fp, uint64_t capture_start_time,
+ uint64_t start_utc_time_us);
+static size_t parse_and_print_events(const unsigned char *data, size_t len);
+
+/* Size of the buffers used to hold a session id rendered as a string. */
+#define SESSION_ID_STR_MAX 32
+
+/*
+ * Get or create the output file for a session when using -o dir= mode.
+ *
+ * The file is named "<output_dir>/<id>.elog", where <id> is session_id
+ * reduced to [A-Za-z0-9_-] ('/' and '\\' become '_', other characters are
+ * dropped); an id that sanitizes to nothing is stored as "global".
+ * Newly created entries are appended to session_files.
+ *
+ * Returns the open stream, or NULL if output_dir is not set (single-file
+ * mode).  Allocation, path-length and open failures are fatal.
+ */
+static FILE *
+get_session_output_file(const char *session_id)
+{
+	struct session_file *sf;
+	char sanitized[SESSION_ID_STR_MAX];
+	char fullpath[PATH_MAX];
+	size_t i, j;
+	int n;
+
+	if (output_dir == NULL)
+		return (NULL);
+
+	STAILQ_FOREACH(sf, &session_files, link) {
+		if (strcmp(sf->session_id, session_id) == 0)
+			return (sf->fp);
+	}
+
+	for (i = 0, j = 0;
+	    session_id[i] != '\0' && j < (sizeof(sanitized) - 1); i++) {
+		char c = session_id[i];
+
+		if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+		    (c >= '0' && c <= '9') || c == '-' || c == '_')
+			sanitized[j++] = c;
+		else if (c == '/' || c == '\\')
+			sanitized[j++] = '_';
+	}
+	sanitized[j] = '\0';
+	if (j == 0)
+		snprintf(sanitized, sizeof(sanitized), "global");
+
+	/*
+	 * calloc() so that every field, including start_utc_us, has a
+	 * defined value until the header is written.
+	 */
+	sf = calloc(1, sizeof(*sf));
+	if (sf == NULL)
+		err(1, "malloc(session_file)");
+	sf->session_id = strdup(session_id);
+	if (sf->session_id == NULL)
+		err(1, "strdup");
+	n = snprintf(fullpath, sizeof(fullpath), "%s/%s.elog", output_dir,
+	    sanitized);
+	if (n < 0 || (size_t)n >= sizeof(fullpath))
+		errx(1, "path too long");
+	sf->filepath = strdup(fullpath);
+	if (sf->filepath == NULL)
+		err(1, "strdup");
+	sf->fp = fopen(fullpath, "wb");
+	if (sf->fp == NULL)
+		err(1, "fopen(%s)", fullpath);
+	sf->header_written = false;
+	sf->capture_start = 0;
+	sf->start_utc_us = 0;
+	sf->event_count = 0;
+	STAILQ_INSERT_TAIL(&session_files, sf, link);
+	return (sf->fp);
+}
+
+/*
+ * Look up the per-session record whose session_id string equals the
+ * argument.  Returns NULL when no such session file is currently open.
+ */
+static struct session_file *
+find_session_file(const char *session_id)
+{
+	struct session_file *cur;
+
+	for (cur = STAILQ_FIRST(&session_files); cur != NULL;
+	    cur = STAILQ_NEXT(cur, link)) {
+		if (strcmp(cur->session_id, session_id) != 0)
+			continue;
+		return (cur);
+	}
+	return (NULL);
+}
+
+/*
+ * Fill in every field of a binary file header: the magic and format
+ * version are fixed, the remaining values come from the arguments.
+ */
+static void
+init_binary_header(struct elog_binary_header *hdr, uint64_t capture_start,
+    uint64_t start_utc_us, uint64_t event_count, uint64_t dropped_events)
+{
+	hdr->version = ELOG_BINARY_VERSION;
+	memcpy(hdr->magic, ELOG_BINARY_MAGIC, sizeof(hdr->magic));
+
+	hdr->capture_start = capture_start;
+	hdr->start_utc_us = start_utc_us;
+
+	hdr->event_count = event_count;
+	hdr->dropped_events = dropped_events;
+}
+
+/*
+ * Overwrite the header at the start of fp with updated values (used to
+ * record final event and drop counts).  The stream position is left just
+ * past the header.  Seek and write failures are fatal.
+ */
+static void
+rewrite_binary_header(FILE *fp, uint64_t capture_start,
+    uint64_t start_utc_us, uint64_t event_count, uint64_t dropped_events)
+{
+	struct elog_binary_header fresh;
+
+	init_binary_header(&fresh, capture_start, start_utc_us,
+	    event_count, dropped_events);
+
+	if (fseek(fp, 0, SEEK_SET) != 0)
+		err(1, "fseek");
+
+	if (fwrite(&fresh, sizeof(fresh), 1, fp) != 1)
+		err(1, "fwrite(binary header)");
+}
+
+/*
+ * Finish and close the per-session file for session_id, if one is open:
+ * rewrite its header with the final event count (the drop count is written
+ * as 0), close the stream, and release the list entry.
+ *
+ * A failing fclose() means buffered event data could not be written; it is
+ * reported with warn() rather than silently ignored.
+ */
+static void
+close_session_file(const char *session_id)
+{
+	struct session_file *sf;
+
+	/* STAILQ_FOREACH leaves sf == NULL when nothing matched. */
+	STAILQ_FOREACH(sf, &session_files, link) {
+		if (strcmp(sf->session_id, session_id) == 0)
+			break;
+	}
+	if (sf == NULL)
+		return;
+
+	if (sf->header_written)
+		rewrite_binary_header(sf->fp, sf->capture_start,
+		    sf->start_utc_us, sf->event_count, 0);
+	/* fclose() flushes; its result is the last chance to see errors. */
+	if (fclose(sf->fp) != 0)
+		warn("fclose(%s)", sf->filepath);
+	sf->fp = NULL;
+
+	STAILQ_REMOVE(&session_files, sf, session_file, link);
+	free(sf->filepath);
+	free(sf->session_id);
+	free(sf);
+}
+
+/*
+ * Emit the file preamble at the current position of fp: a header with zero
+ * event and drop counts, the number of known providers, and then the
+ * provider records themselves.  Any write failure is fatal.
+ */
+static void
+write_binary_header_to_file(FILE *fp, uint64_t capture_start_time,
+    uint64_t start_utc_time_us)
+{
+	struct elog_binary_header preamble;
+
+	init_binary_header(&preamble, capture_start_time, start_utc_time_us,
+	    0, 0);
+	if (fwrite(&preamble, sizeof(preamble), 1, fp) != 1)
+		err(1, "fwrite(binary header)");
+
+	if (fwrite(&provider_map_count, sizeof(provider_map_count), 1, fp) != 1)
+		err(1, "fwrite(provider count)");
+
+	/* The records are contiguous, so write the used prefix in one go. */
+	if (provider_map_count > 0 &&
+	    fwrite(provider_map, sizeof(provider_map[0]), provider_map_count,
+	    fp) != provider_map_count)
+		err(1, "fwrite(provider)");
+}
+
+/*
+ * Append one event (header + payload) to the binary output.
+ *
+ * In -o dir= mode the destination is the per-session file for
+ * hdr->session_id (created on demand); otherwise it is single_output.
+ * The file preamble is written lazily, just before the first event, using
+ * that event's timestamp as the capture start and the current wall-clock
+ * time as its UTC equivalent.  All failures are fatal.
+ */
+static void
+write_binary_event(const struct eventlog_event_header *hdr,
+ const void *payload, size_t payload_size)
+{
+ char session_id_str[SESSION_ID_STR_MAX];
+ FILE *out_fp;
+ struct session_file *sf;
+ size_t event_length;
+
+ /* Session files are keyed by the decimal rendering of the session id. */
+ snprintf(session_id_str, sizeof(session_id_str), "%lu",
+ (unsigned long)hdr->session_id);
+
+ if (output_dir != NULL) {
+ /* The first call opens the file; the second fetches its record. */
+ out_fp = get_session_output_file(session_id_str);
+ sf = find_session_file(session_id_str);
+ } else {
+ out_fp = single_output.fp;
+ sf = &single_output;
+ }
+
+ if (sf != NULL) {
+ if (!sf->header_written) {
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ uint64_t utc_us = (uint64_t)tv.tv_sec * 1000000 +
+ tv.tv_usec;
+ write_binary_header_to_file(out_fp, hdr->timestamp,
+ utc_us);
+ sf->header_written = true;
+ sf->capture_start = hdr->timestamp;
+ sf->start_utc_us = utc_us;
+ }
+ sf->event_count++;
+ }
+
+ /* The on-disk length field is 16 bits wide. */
+ event_length = sizeof(struct eventlog_event_header) + payload_size;
+ if (event_length > UINT16_MAX)
+ errx(1, "Event too large for binary format: %zu bytes",
+ event_length);
+
+ /* Write a copy so event_length always covers header + payload. */
+ struct eventlog_event_header hdr_copy = *hdr;
+ hdr_copy.event_length = (uint16_t)event_length;
+ if (fwrite(&hdr_copy, sizeof(struct eventlog_event_header), 1, out_fp)
+ != 1)
+ err(1, "fwrite(event header)");
+ if (payload_size > 0 && fwrite(payload, payload_size, 1, out_fp) != 1)
+ err(1, "fwrite(payload)");
+
+ /*
+ * A session-end event finalizes and closes that session's file.
+ * NOTE(review): a later event carrying the same session id would
+ * reopen the path with "wb" and truncate it -- confirm this cannot
+ * happen.
+ */
+ if (output_dir != NULL) {
+ if (eventlog_is_session_end(NULL, hdr->event_id))
+ close_session_file(session_id_str);
+ }
+}
+
+/*
+ * Emit one event.  In binary output mode the event is handed to
+ * write_binary_event() unchanged; otherwise a single text line is written
+ * to stdout in the form
+ *
+ *   [num ][+rel ][ddelta ][cpu]tid::timestamp [provider][session][name] text
+ *
+ * where the bracketed prefixes and the event name appear only when the
+ * corresponding show_* option is set.
+ */
+static void
+print_eventlog_event(const struct eventlog_event_header *hdr,
+ const void *payload, size_t payload_size)
+{
+ char log_line[2048];
+ char formatted_buf[1024];
+ char session_id_str[SESSION_ID_STR_MAX];
+ char timestamp_str[32];
+ char event_name_buf[64];
+ char event_num_buf[32];
+ char relative_buf[32];
+ char delta_buf[32];
+ const char *provider_name;
+ int formatted_len;
+
+ if (binary_output_mode()) {
+ write_binary_event(hdr, payload, payload_size);
+ return;
+ }
+
+ snprintf(session_id_str, sizeof(session_id_str), "%lu",
+ (unsigned long)hdr->session_id);
+ provider_name = get_provider_name(hdr->provider_id);
+
+ /*
+ * The read_* bases are only set by read_binary_file(); when they are
+ * zero, format_timestamp() falls back to an uptime-style value.
+ */
+ format_timestamp(hdr->timestamp, timestamp_str, sizeof(timestamp_str),
+ read_capture_start, read_start_utc_us);
+
+ relative_buf[0] = '\0';
+ if (show_relative_time) {
+ /* A zero first_event_ts means "no event seen yet". */
+ if (first_event_ts == 0)
+ first_event_ts = hdr->timestamp;
+ uint64_t rel = hdr->timestamp - first_event_ts;
+ snprintf(relative_buf, sizeof(relative_buf),
+ "+%llu.%06llu ",
+ (unsigned long long)(rel / 1000000),
+ (unsigned long long)(rel % 1000000));
+ }
+
+ delta_buf[0] = '\0';
+ if (show_delta_time) {
+ /* The first event has no predecessor and prints a zero delta. */
+ uint64_t delta = 0;
+ if (prev_event_ts != 0)
+ delta = hdr->timestamp - prev_event_ts;
+ snprintf(delta_buf, sizeof(delta_buf),
+ "d%llu.%06llu ",
+ (unsigned long long)(delta / 1000000),
+ (unsigned long long)(delta % 1000000));
+ }
+ prev_event_ts = hdr->timestamp;
+
+ /* A non-positive result is reported as an unknown event id. */
+ formatted_len = eventlog_format_payload(
+ provider_name, payload, payload_size,
+ hdr->event_id, formatted_buf, sizeof(formatted_buf));
+ if (formatted_len <= 0)
+ snprintf(formatted_buf, sizeof(formatted_buf),
+ "[UNKNOWN_EVENT_ID:%u]", hdr->event_id);
+
+ /*
+ * events_received is incremented by the caller after this function
+ * returns, hence the +1 for a 1-based serial number.
+ */
+ event_num_buf[0] = '\0';
+ if (show_event_number)
+ snprintf(event_num_buf, sizeof(event_num_buf),
+ "%-8llu ", (unsigned long long)(events_received + 1));
+
+ event_name_buf[0] = '\0';
+ if (show_event_name) {
+ const char *name = eventlog_event_id_to_name(
+ provider_name, hdr->event_id);
+ if (name != NULL)
+ snprintf(event_name_buf, sizeof(event_name_buf),
+ "[%s]", name);
+ else
+ snprintf(event_name_buf, sizeof(event_name_buf),
+ "[?%u]", hdr->event_id);
+ }
+
+ /* snprintf() truncates lines longer than log_line. */
+ snprintf(log_line, sizeof(log_line),
+ "%s%s%s[%2u]%04x::%s [%s][%s]%s %s\n",
+ event_num_buf,
+ relative_buf,
+ delta_buf,
+ hdr->cpu,
+ (unsigned int)hdr->thread_id,
+ timestamp_str,
+ provider_name,
+ session_id_str,
+ event_name_buf,
+ formatted_buf);
+
+ fputs(log_line, stdout);
+}
+
+/*
+ * Refresh the header(s) of the binary output with final counters.
+ *
+ * Single-file mode records the global events_received and
+ * last_dropped_events values.  Directory mode rewrites every open session
+ * file with its own event count and a drop count of 0.  Files whose header
+ * has not been written yet are left alone.
+ */
+static void
+update_binary_header(void)
+{
+	struct session_file *entry;
+
+	if (output_dir == NULL) {
+		if (single_output.header_written)
+			rewrite_binary_header(single_output.fp,
+			    single_output.capture_start,
+			    single_output.start_utc_us, events_received,
+			    last_dropped_events);
+		return;
+	}
+
+	STAILQ_FOREACH(entry, &session_files, link) {
+		if (!entry->header_written)
+			continue;
+		rewrite_binary_header(entry->fp, entry->capture_start,
+		    entry->start_utc_us, entry->event_count, 0);
+	}
+}
+
+/*
+ * Return true when filename ends in ".gz" (case-sensitive).
+ */
+static bool
+has_gz_extension(const char *filename)
+{
+	static const char suffix[] = ".gz";
+	const size_t suffix_len = sizeof(suffix) - 1;
+	size_t name_len = strlen(filename);
+
+	if (name_len < suffix_len)
+		return (false);
+	return (memcmp(filename + name_len - suffix_len, suffix,
+	    suffix_len) == 0);
+}
+
+/*
+ * Thin wrappers to abstract FILE vs gzFile for the read path.
+ *
+ * Exactly one of fp and gz is in use, selected by is_gz;
+ * elog_reader_open() sets the other one to NULL.
+ */
+struct elog_reader {
+ FILE *fp; /* Plain stdio stream (is_gz == false) */
+ gzFile gz; /* zlib stream (is_gz == true) */
+ bool is_gz; /* Set when the file name ends in ".gz" */
+};
+
+/*
+ * Open filename for reading and initialize *r.  Names ending in ".gz" are
+ * opened through zlib, everything else through stdio.  Failure to open is
+ * fatal.
+ */
+static void
+elog_reader_open(struct elog_reader *r, const char *filename)
+{
+	r->is_gz = has_gz_extension(filename);
+
+	if (!r->is_gz) {
+		r->gz = NULL;
+		r->fp = fopen(filename, "rb");
+		if (r->fp == NULL)
+			err(1, "fopen(%s)", filename);
+		return;
+	}
+
+	r->fp = NULL;
+	r->gz = gzopen(filename, "rb");
+	if (r->gz == NULL)
+		err(1, "gzopen(%s)", filename);
+}
+
+/*
+ * Read up to len bytes into buf from the underlying stream.
+ *
+ * Returns the number of bytes read; a short count or 0 indicates end of
+ * file.  Read errors are fatal for both back ends.  For stdio this matters
+ * because fread() returns 0 on error without setting the EOF indicator, so
+ * a caller looping "until EOF" would otherwise never terminate.
+ */
+static ssize_t
+elog_reader_read(struct elog_reader *r, void *buf, size_t len)
+{
+	size_t got;
+
+	if (r->is_gz) {
+		int ret = gzread(r->gz, buf, (unsigned)len);
+
+		if (ret < 0) {
+			int errnum;
+			const char *msg = gzerror(r->gz, &errnum);
+
+			errx(1, "gzread: %s", msg);
+		}
+		return ((ssize_t)ret);
+	}
+
+	got = fread(buf, 1, len, r->fp);
+	if (got < len && ferror(r->fp))
+		err(1, "fread");
+	return ((ssize_t)got);
+}
+
+/*
+ * Report whether the underlying stream has its end-of-file indicator set.
+ */
+static bool
+elog_reader_eof(struct elog_reader *r)
+{
+	int at_end;
+
+	at_end = r->is_gz ? gzeof(r->gz) : feof(r->fp);
+	return (at_end != 0);
+}
+
+/*
+ * Close whichever stream elog_reader_open() opened.
+ */
+static void
+elog_reader_close(struct elog_reader *r)
+{
+	if (!r->is_gz) {
+		fclose(r->fp);
+		return;
+	}
+	gzclose(r->gz);
+}
+
+/*
+ * Read exactly 'len' bytes into buf.
+ *
+ * Returns true once all bytes have been read.  Returns false when the
+ * stream yields no data at all (end of file before the first byte).
+ * Running out of data after some, but not all, bytes were read is fatal.
+ */
+static bool
+elog_reader_read_exact(struct elog_reader *r, void *buf, size_t len)
+{
+	char *dst = buf;
+	size_t filled;
+
+	for (filled = 0; filled < len; ) {
+		ssize_t got = elog_reader_read(r, dst + filled, len - filled);
+
+		if (got > 0) {
+			filled += got;
+			continue;
+		}
+		if (filled != 0)
+			errx(1,
+			    "Unexpected end of file (read %zu of %zu bytes)",
+			    filled, len);
+		return (false);
+	}
+	return (true);
+}
+
+/*
+ * Read a binary capture file (optionally gzip-compressed) and print its
+ * events as text.
+ *
+ * The file preamble (header, provider count, provider records) is
+ * validated and loaded into the globals used by the formatting code; the
+ * event stream is then read in 64KB chunks and parsed with the same
+ * routine used for live kernel data.  Bytes left over at end of file that
+ * do not form a complete event are reported and ignored.
+ *
+ * Returns 0 on success.  Malformed or truncated preambles and read errors
+ * are fatal.
+ */
+static int
+read_binary_file(const char *filename)
+{
+	const size_t buffer_capacity = 64 * 1024;	/* 64KB chunks */
+	struct elog_reader reader;
+	struct elog_binary_header file_hdr;
+	unsigned char *buffer;
+	size_t buffer_used = 0;
+
+	elog_reader_open(&reader, filename);
+
+	/* Read and validate file header */
+	memset(&file_hdr, 0, sizeof(file_hdr));
+	if (!elog_reader_read_exact(&reader, &file_hdr, sizeof(file_hdr)))
+		errx(1, "File is empty");
+	if (memcmp(file_hdr.magic, ELOG_BINARY_MAGIC, 4) != 0)
+		errx(1, "Invalid binary file: bad magic number");
+	if (file_hdr.version != ELOG_BINARY_VERSION)
+		errx(1, "Unsupported file version: %u (expected %u)",
+		    file_hdr.version, ELOG_BINARY_VERSION);
+
+	/*
+	 * Stash for format_timestamp when printing
+	 * (UTC = start_utc_us + (event_ts - capture_start)).
+	 */
+	read_capture_start = file_hdr.capture_start;
+	read_start_utc_us = file_hdr.start_utc_us;
+
+	/*
+	 * Provider table for event lookup.  A false return here means the
+	 * file ended early; errno is not meaningful, so use errx().
+	 */
+	if (!elog_reader_read_exact(&reader, &provider_map_count,
+	    sizeof(provider_map_count)))
+		errx(1, "%s: truncated file (missing provider count)",
+		    filename);
+	if (provider_map_count > EVENTLOG_MAX_PROVIDERS)
+		errx(1, "Invalid provider count %u in file",
+		    provider_map_count);
+	for (uint32_t i = 0; i < provider_map_count; i++) {
+		if (!elog_reader_read_exact(&reader, &provider_map[i],
+		    sizeof(provider_map[i])))
+			errx(1, "%s: truncated file (provider %u of %u)",
+			    filename, i, provider_map_count);
+	}
+
+	print_provider_names();
+
+	buffer = malloc(buffer_capacity);
+	if (buffer == NULL)
+		err(1, "malloc(buffer)");
+
+	for (;;) {
+		size_t off = 0, consumed;
+		ssize_t nread;
+
+		/*
+		 * A full buffer from which nothing could be consumed can
+		 * never make progress.
+		 */
+		if (buffer_used == buffer_capacity)
+			errx(1,
+			    "Partial event too large, file may be corrupted");
+
+		nread = elog_reader_read(&reader, buffer + buffer_used,
+		    buffer_capacity - buffer_used);
+		if (nread <= 0) {
+			/* No data and no EOF indicator: a read error. */
+			if (!elog_reader_eof(&reader))
+				errx(1, "%s: read error", filename);
+			break;
+		}
+		buffer_used += (size_t)nread;
+
+		/*
+		 * The parser returns early after skipping a bad header, so
+		 * keep calling it until it stops making progress.
+		 */
+		do {
+			consumed = parse_and_print_events(buffer + off,
+			    buffer_used - off);
+			off += consumed;
+		} while (consumed > 0 && off < buffer_used);
+
+		/* Keep the incomplete tail for the next chunk. */
+		buffer_used -= off;
+		if (buffer_used > 0 && off > 0)
+			memmove(buffer, buffer + off, buffer_used);
+	}
+
+	if (buffer_used > 0)
+		warnx("%s: ignoring %zu trailing bytes (incomplete event)",
+		    filename, buffer_used);
+
+	free(buffer);
+	elog_reader_close(&reader);
+
+	return (0);
+}
+
+/*
+ * Parse and print eventlog data.
+ * Format (V1):
+ * Each event: eventlog_event_header (includes provider_id, session_id,
+ * event_id) + payload.
+ * event_length = sizeof(header) + payload_size
+ * Multiple events can be present in a single buffer read.
+ *
+ * Parsing stops at the first incomplete event.  An event_length smaller
+ * than the header is reported on stderr; the bad header is skipped and the
+ * function returns immediately.
+ *
+ * Returns the number of bytes consumed (complete events processed, plus a
+ * skipped bad header if any).
+ * Requires provider_map to be populated (from GET_PROVIDERS or file header).
+ */
+static size_t
+parse_and_print_events(const unsigned char *data, size_t len)
+{
+	const size_t hdr_size = sizeof(struct eventlog_event_header);
+	size_t off = 0;
+
+	/*
+	 * All bounds checks compare lengths rather than pointers, so no
+	 * pointer beyond one-past-the-end of the buffer is ever formed.
+	 */
+	while (len - off >= hdr_size) {
+		struct eventlog_event_header hdr;
+
+		/* Copy out: the buffer offset may not be aligned. */
+		memcpy(&hdr, data + off, hdr_size);
+
+		if (hdr.event_length < hdr_size) {
+			fprintf(stderr,
+			    "Error: invalid event_length %u at offset %zu\n",
+			    hdr.event_length, off);
+			return (off + hdr_size);
+		}
+		if (hdr.event_length > len - off)
+			break;
+
+		print_eventlog_event(&hdr, data + off + hdr_size,
+		    hdr.event_length - hdr_size);
+		events_received++;
+		off += hdr.event_length;
+	}
+	return (off);
+}
+
+/*
+ * Exit-time statistics.  Runs at most once (it is also registered with
+ * atexit()).  The kernel drop counter is always refreshed while the device
+ * is open, because the binary header update needs it; the human-readable
+ * summary is printed to stderr only when verbose_stats is set.
+ */
+static void
+print_stats(void)
+{
+	struct eventlog_stats kstats;
+
+	/* Prevent double-printing */
+	if (stats_printed)
+		return;
+	stats_printed = true;
+
+	if (eventlog_fd >= 0 &&
+	    ioctl(eventlog_fd, EVENTLOG_IOCTL_GET_STATS, &kstats) == 0)
+		last_dropped_events = kstats.dropped_events;
+
+	if (verbose_stats) {
+		fprintf(stderr, "\n[Stats]\n");
+		fprintf(stderr, " Providers: %u\n", provider_map_count);
+		fprintf(stderr, " Events received: %llu\n",
+		    (unsigned long long)events_received);
+		if (last_dropped_events > 0)
+			fprintf(stderr, " Dropped events: %llu\n",
+			    (unsigned long long)last_dropped_events);
+	}
+}
+
+/*
+ * Handler for SIGINT, SIGTERM and (with --duration) SIGALRM.
+ *
+ * It only raises the 'done' flag.  stdio functions such as fflush() are
+ * not async-signal-safe, and the session list may be mid-update when the
+ * signal arrives, so no flushing is done here.  The handlers are installed
+ * without SA_RESTART, so a blocked read() returns EINTR; the main loop in
+ * run_eventlog_mode() then sees 'done' and flushes and closes every output
+ * stream on its normal cleanup path.
+ */
+static void
+sigint_handler(int sig __unused)
+{
+	done = true;
+}
+
+/*
+ * Return true when arg is exactly the long option spelling or, if one is
+ * given (non-NULL), the short spelling.
+ */
+static bool
+arg_match(const char *arg, const char *long_form, const char *short_form)
+{
+	if (strcmp(arg, long_form) == 0)
+		return (true);
+	if (short_form == NULL)
+		return (false);
+	return (strcmp(arg, short_form) == 0);
+}
+
+/*
+ * Parse the command line and populate the global option state.
+ *
+ * Options are matched by exact string comparison (no getopt), so short
+ * options cannot be combined and values must be separate arguments.
+ * Options that take values advance arg_idx past what they consumed.
+ * Invalid input is fatal.
+ */
+static void
+parse_arguments(int argc, char *argv[])
+{
+ int arg_idx;
+ const char *arg;
+
+ for (arg_idx = 1; arg_idx < argc; arg_idx++) {
+ arg = argv[arg_idx];
+ if (arg_match(arg, "--capture", "-c")) {
+ enum eventlog_level level = EVENTLOG_LEVEL_VERBOSE;
+ uint32_t keywords = 0xFFFFFFFF;
+ int next_idx = arg_idx + 1;
+
+ /* Grow the subscription array geometrically. */
+ if (subscription_count >= subscription_capacity) {
+ int new_capacity = (subscription_capacity == 0)
+ ? 16 : subscription_capacity * 2;
+ struct subscription *new_subscriptions =
+ realloc(subscriptions, new_capacity *
+ sizeof(struct subscription));
+ if (new_subscriptions == NULL)
+ errx(1,
+ "failed to allocate subscriptions");
+ subscriptions = new_subscriptions;
+ subscription_capacity = new_capacity;
+ }
+
+ if (next_idx >= argc)
+ errx(1,
+ "--capture requires at least provider name");
+
+ if (strlen(argv[next_idx]) >=
+ EVENTLOG_PROVIDER_NAME_MAX)
+ errx(1, "provider name too long");
+
+ memset(&subscriptions[subscription_count], 0,
+ sizeof(subscriptions[0]));
+ strlcpy(subscriptions[subscription_count].provider_name,
+ argv[next_idx],
+ sizeof(subscriptions[0].provider_name));
+ next_idx++;
+
+ /*
+ * Optional level: consumed only if the next argument
+ * parses as one; otherwise the default is kept.
+ */
+ if (next_idx < argc &&
+ try_parse_level(argv[next_idx], &level))
+ next_idx++;
+
+ /*
+ * Optional keywords (hex 0x prefix or pipe-delimited
+ * names).
+ */
+ if (next_idx < argc &&
+ try_parse_keywords(
+ subscriptions[subscription_count].provider_name,
+ argv[next_idx], &keywords))
+ next_idx++;
+
+ /* Always include SESSION for lifecycle events. */
+ subscriptions[subscription_count].level = level;
+ subscriptions[subscription_count].keywords =
+ keywords | EVENTLOG_KEYWORD_SESSION;
+
+ /* The loop increment moves on to the first unconsumed argument. */
+ arg_idx = next_idx - 1;
+ subscription_count++;
+ } else if (arg_match(arg, "--buffer-size", "-b")) {
+ int next_idx = arg_idx + 1;
+ if (next_idx >= argc)
+ errx(1, "--buffer-size requires a size value");
+ buffer_size_per_cpu = parse_size(argv[next_idx]);
+ arg_idx = next_idx;
+ } else if (arg_match(arg, "--duration", NULL)) {
+ int next_idx = arg_idx + 1;
+ char *endptr;
+ unsigned long val;
+
+ if (next_idx >= argc)
+ errx(1, "--duration requires a seconds value");
+ /*
+ * NOTE(review): strtoul() accepts a leading '-' and
+ * negates the value; where unsigned long is 32 bits
+ * such input would pass the UINT_MAX check below --
+ * confirm whether negative values should be rejected.
+ */
+ val = strtoul(argv[next_idx], &endptr, 10);
+ if (*argv[next_idx] == '\0' || *endptr != '\0')
+ errx(1, "--duration: not a number: %s",
+ argv[next_idx]);
+ if (val > UINT_MAX)
+ errx(1, "--duration: value too large");
+ duration_sec = (unsigned int)val;
+ arg_idx = next_idx;
+ } else if (arg_match(arg, "--date", "-d")) {
+ show_date = true;
+ } else if (arg_match(arg, "--event-name", "-e")) {
+ show_event_name = true;
+ } else if (arg_match(arg, "--event-number", "-n")) {
+ show_event_number = true;
+ } else if (arg_match(arg, "--providers", "-p")) {
+ show_providers = true;
+ } else if (arg_match(arg, "--stats", "-s")) {
+ verbose_stats = true;
+ } else if (arg_match(arg, "--output", "-o")) {
+ int next_idx = arg_idx + 1;
+ if (next_idx >= argc)
+ errx(1,
+ "--output requires a filename or dir=path");
+ /*
+ * "dir=<path>" selects per-session files; anything
+ * else is a single output file, opened (and
+ * truncated) immediately.
+ * NOTE(review): a repeated -o overwrites the previous
+ * output_dir / stream without releasing it.
+ */
+ if (strncmp(argv[next_idx], "dir=", 4) == 0) {
+ output_dir = strdup(argv[next_idx] + 4);
+ if (output_dir == NULL)
+ err(1, "strdup");
+ if (mkdir(output_dir, 0755) != 0 &&
+ errno != EEXIST)
+ err(1, "mkdir(%s)", output_dir);
+ } else {
+ single_output.fp = fopen(argv[next_idx], "wb");
+ if (single_output.fp == NULL)
+ err(1, "fopen(%s)", argv[next_idx]);
+ }
+ arg_idx = next_idx;
+ } else if (arg_match(arg, "--relative-time", "-t")) {
+ show_relative_time = true;
+ } else if (arg_match(arg, "--delta-time", NULL)) {
+ show_delta_time = true;
+ } else if (arg_match(arg, "--dump-state", "-D")) {
+ dump_state = true;
+ } else if (arg_match(arg, "--read-binary", "-r")) {
+ int next_idx = arg_idx + 1;
+ if (next_idx >= argc)
+ errx(1, "--read-binary requires a filename");
+ /* Points into argv; not copied. */
+ binary_input_file = argv[next_idx];
+ arg_idx = next_idx;
+ } else if (arg_match(arg, "--help", "-h")) {
+ usage();
+ } else {
+ errx(1, "unknown argument: %s (use --capture or -c)",
+ arg);
+ }
+ }
+}
+
+/*
+ * Run eventlog device mode - open device, create subscriber, and read events.
+ *
+ * Steps:
+ *  1. Open /dev/eventlog and issue the CREATE ioctl carrying the per-CPU
+ *     buffer size and every subscription collected from the command line.
+ *  2. Fetch the provider id->name table (needed to format events and to
+ *     write the binary file preamble).
+ *  3. Read and process events until a signal handler sets 'done'.
+ *  4. Record final statistics, update the binary header(s) and release
+ *     all resources.
+ *
+ * Returns 0 on normal termination; setup and read failures are fatal.
+ */
+static int
+run_eventlog_mode(void)
+{
+	static const char device_path[] = "/dev/eventlog";
+	const size_t bufsize = 1024 * 1024;
+	struct eventlog_get_providers_resp prov_resp;
+	struct eventlog_create_req *req;
+	struct session_file *sf, *sf_next;
+	size_t req_size;
+	u_long ioctl_cmd;
+	ssize_t nread;
+	char *buffer;
+	int fd, i;
+
+	/* Open device */
+	fd = open(device_path, O_RDONLY);
+	if (fd < 0)
+		err(1, "open(%s)", device_path);
+
+	/*
+	 * The CREATE request is a fixed part followed by one entry per
+	 * subscription; allocate exactly that much, zero-filled.
+	 */
+	req_size = __builtin_offsetof(struct eventlog_create_req,
+	    subscriptions) +
+	    subscription_count * sizeof(struct eventlog_subscription_req);
+	req = calloc(1, req_size);
+	if (req == NULL)
+		err(1, "malloc");
+
+	req->buffer_size_per_cpu = buffer_size_per_cpu;
+	req->count = subscription_count;
+	for (i = 0; i < subscription_count; i++) {
+		strlcpy(req->subscriptions[i].provider_name,
+		    subscriptions[i].provider_name,
+		    sizeof(req->subscriptions[i].provider_name));
+		req->subscriptions[i].level = subscriptions[i].level;
+		req->subscriptions[i].keywords = subscriptions[i].keywords;
+		req->subscriptions[i].flags = dump_state ?
+		    EVENTLOG_SUBSCRIPTION_DUMP_STATE : 0;
+	}
+
+	/*
+	 * The ioctl command encodes the argument size; make sure it agrees
+	 * with the buffer that is about to be passed.
+	 */
+	ioctl_cmd = EVENTLOG_IOCTL_CREATE_SIZE(subscription_count);
+	if (IOCPARM_LEN(ioctl_cmd) != req_size)
+		errx(1, "ioctl size calculation error");
+
+	/* Send CREATE ioctl (creates subscriber and subscribes) */
+	if (ioctl(fd, ioctl_cmd, req) != 0)
+		err(1, "ioctl(EVENTLOG_IOCTL_CREATE)");
+	free(req);
+
+	/* Get provider ids for event lookup (required for new binary format) */
+	memset(&prov_resp, 0, sizeof(prov_resp));
+	if (ioctl(fd, EVENTLOG_IOCTL_GET_PROVIDERS, &prov_resp) != 0)
+		err(1, "ioctl(EVENTLOG_IOCTL_GET_PROVIDERS)");
+	/* Never trust a count to fit the local table without checking. */
+	if (prov_resp.count > nitems(provider_map))
+		errx(1, "kernel reported %u providers (maximum %u)",
+		    (unsigned int)prov_resp.count,
+		    (unsigned int)nitems(provider_map));
+	provider_map_count = prov_resp.count;
+	memcpy(provider_map, prov_resp.providers,
+	    provider_map_count * sizeof(provider_map[0]));
+
+	print_provider_names();
+
+	/* Allocate buffer */
+	buffer = malloc(bufsize);
+	if (buffer == NULL)
+		err(1, "malloc");
+
+	eventlog_fd = fd;	/* Lets print_stats() query the kernel */
+
+	/* Read and parse events */
+	while (!done) {
+		nread = read(fd, buffer, bufsize);
+		if (nread < 0) {
+			/*
+			 * EINTR: a signal arrived; the loop condition
+			 * decides whether to stop.  EAGAIN: just retry.
+			 */
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			err(1, "read");
+		}
+		if (nread == 0) {
+			/* EOF - wait a bit and retry */
+			usleep(100000);	/* 100ms */
+			continue;
+		}
+
+		/* Provider name is included in the event format. */
+		parse_and_print_events((const unsigned char *)buffer,
+		    (size_t)nread);
+	}
+
+	/* Print stats before cleanup */
+	print_stats();
+
+	/* Update binary header with final event/drop counts before closing */
+	update_binary_header();
+
+	/* Cleanup on exit */
+	close(fd);
+	eventlog_fd = -1;
+	free(buffer);
+	free(subscriptions);
+	if (output_dir != NULL) {
+		for (sf = STAILQ_FIRST(&session_files); sf != NULL;
+		    sf = sf_next) {
+			sf_next = STAILQ_NEXT(sf, link);
+			if (sf->fp != NULL) {
+				fflush(sf->fp);
+				fclose(sf->fp);
+				sf->fp = NULL;
+			}
+			free(sf->filepath);
+			free(sf->session_id);
+			free(sf);
+		}
+		STAILQ_INIT(&session_files);
+		free(output_dir);
+		output_dir = NULL;
+	} else if (single_output.fp != NULL) {
+		fflush(single_output.fp);
+		fclose(single_output.fp);
+		single_output.fp = NULL;
+	}
+
+	return (0);
+}
+
+/*
+ * Entry point: parse options, then either convert a binary capture file to
+ * text (-r) or capture live events from the eventlog device.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct sigaction sa = { .sa_handler = sigint_handler };
+
+	parse_arguments(argc, argv);
+
+	/* Read mode bypasses capture entirely. */
+	if (binary_input_file != NULL) {
+		if (subscription_count > 0)
+			errx(1,
+			    "--read-binary cannot be used with subscribe options");
+		return (read_binary_file(binary_input_file));
+	}
+
+	if (subscription_count == 0)
+		errx(1, "no subscriptions specified (use --capture or -c)");
+
+	/* Make sure statistics are gathered on every exit path. */
+	atexit(print_stats);
+
+	/*
+	 * Install the termination handlers without SA_RESTART (sa_flags is
+	 * zero) so that a blocked read() is interrupted and the main loop
+	 * can notice the 'done' flag.
+	 */
+	sigemptyset(&sa.sa_mask);
+	(void)sigaction(SIGINT, &sa, NULL);
+	(void)sigaction(SIGTERM, &sa, NULL);
+
+	/*
+	 * --duration arms SIGALRM with the same handler, so a timed capture
+	 * ends through the same cleanup path as SIGINT/SIGTERM.
+	 */
+	if (duration_sec > 0) {
+		(void)sigaction(SIGALRM, &sa, NULL);
+		alarm(duration_sec);
+	}
+
+	return (run_eventlog_mode());
+}
+
diff --git a/usr.bin/elog/gen_eventlog_headers.sh b/usr.bin/elog/gen_eventlog_headers.sh
new file mode 100755
--- /dev/null
+++ b/usr.bin/elog/gen_eventlog_headers.sh
@@ -0,0 +1,193 @@
+#!/bin/sh
+#
+# Copyright (c) 2026 Netflix, Inc.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# Generate eventlog consumer headers from schema files. This script
+# generates both individual provider headers and the master header.
+#
+# Usage:
+# gen_eventlog_headers.sh <schema_dir> <header_dir> <srctop> \
+# <awk_script> <master_header>
+#
+
+set -e
+
+# All five positional arguments are mandatory; check before using them.
+if [ $# -ne 5 ]; then
+	echo "Usage: $0 <schema_dir> <header_dir> <srctop> <awk_script>" \
+	    "<master_header>" >&2
+	exit 1
+fi
+SCHEMA_DIR="$1"
+HEADER_DIR="$2"
+SRCTOP="$3"
+AWK_SCRIPT="$4"
+MASTER_HEADER="$5"
+
+# Resolve HEADER_DIR to an absolute path BEFORE any cd operations to
+# avoid creating directories in the source tree.
+case "${HEADER_DIR}" in
+ /*)
+ ABS_HEADER_DIR="${HEADER_DIR}"
+ ;;
+ *)
+ _ORIG_PWD="${PWD}"
+ if command -v realpath >/dev/null 2>&1; then
+ ABS_HEADER_DIR="$(realpath -m \
+ "${_ORIG_PWD}/${HEADER_DIR}")"
+ else
+ if [ "${HEADER_DIR}" = "." ]; then
+ ABS_HEADER_DIR="${_ORIG_PWD}"
+ elif [ "${HEADER_DIR}" = ".." ]; then
+ ABS_HEADER_DIR="$(cd "${_ORIG_PWD}/.." \
+ && pwd)"
+ else
+ ABS_HEADER_DIR="${_ORIG_PWD}/${HEADER_DIR}"
+ fi
+ fi
+ ;;
+esac
+
+# Refuse to write inside the source tree; SRCTOP is matched literally.
+if [ "${ABS_HEADER_DIR#"${SRCTOP}"/}" != "${ABS_HEADER_DIR}" ]; then
+	echo "ERROR: Header directory ${ABS_HEADER_DIR} would be" \
+	    "created inside source tree ${SRCTOP}" >&2
+	exit 1
+fi
+
+mkdir -p "${ABS_HEADER_DIR}"
+
+# Print one lower-cased provider name per *_eventlog_schema.src found
+# under SCHEMA_DIR. Used by the per-provider loops below.
+list_providers() {
+ [ -d "${SCHEMA_DIR}" ] || return 0
+ for schema_path in $(find "${SCHEMA_DIR}" \
+ -name '*_eventlog_schema.src' 2>/dev/null | sort); do
+ awk '/^PROVIDER/ {print tolower($2); exit}' \
+ "${schema_path}" 2>/dev/null || true
+ done
+}
+
+# Step 1: Generate individual consumer headers for each schema.
+if [ -d "${SCHEMA_DIR}" ]; then
+ for schema_path in $(find "${SCHEMA_DIR}" \
+ -name '*_eventlog_schema.src' 2>/dev/null | sort); do
+ provider=$(awk '/^PROVIDER/ {print tolower($2); exit}' \
+ "${schema_path}" 2>/dev/null || true)
+ if [ -n "${provider}" ]; then
+ (cd "${SRCTOP}" && \
+ awk -v outdir="${ABS_HEADER_DIR}" \
+ -f "${AWK_SCRIPT}" "${schema_path}" -c)
+ fi
+ done
+fi
+
+# Step 2: Generate master consumer header that includes all provider
+# headers.
+case "${MASTER_HEADER}" in
+ /*)
+ ABS_MASTER_HEADER="${MASTER_HEADER}"
+ ;;
+ *)
+ ABS_MASTER_HEADER="${ABS_HEADER_DIR}/${MASTER_HEADER}"
+ ;;
+esac
+
+# Master header, part 1: guard, includes and the SESSION_END predicate.
+{
+	cat << 'EOF'
+/* Auto-generated consumer header - includes all provider consumer headers */
+#ifndef _EVENTLOG_CONSUMER_H_
+#define _EVENTLOG_CONSUMER_H_
+
+#include <sys/eventlog.h>
+EOF
+	for provider in $(list_providers); do
+		echo "#include \"${provider}_eventlog_consumer.h\""
+	done
+	cat << 'EOF'
+
+/*
+ * Check if event is SESSION_END (fixed ID, all providers). Include
+ * sys/eventlog.h for EVENTLOG_SESSION_END_ID.
+ */
+static inline bool
+eventlog_is_session_end(const char *provider_name, uint32_t event_id)
+{
+ (void)provider_name;
+ return event_id == EVENTLOG_SESSION_END_ID;
+}
+EOF
+} > "${ABS_MASTER_HEADER}"
+
+cat >> "${ABS_MASTER_HEADER}" << 'EOF'
+
+/* Master formatting function that routes to per-provider formatters */
+static inline int
+eventlog_format_payload(const char *provider_name, const void *payload,
+ size_t payload_size, uint32_t event_id, char *buf, size_t bufsize)
+{
+EOF
+
+for provider in $(list_providers); do
+ {
+ echo " if (strcmp(provider_name, \"${provider}\") == 0)"
+ echo " return ${provider}_eventlog_format_payload("
+ echo " payload, payload_size, event_id,"
+ echo " buf, bufsize);"
+ } >> "${ABS_MASTER_HEADER}"
+done
+
+cat >> "${ABS_MASTER_HEADER}" << 'EOF'
+ return snprintf(buf, bufsize, "[UNKNOWN_PROVIDER:%s]",
+ provider_name);
+}
+EOF
+
+cat >> "${ABS_MASTER_HEADER}" << 'EOF'
+
+/* Master event ID to name lookup (routes to per-provider lookups) */
+static inline const char *
+eventlog_event_id_to_name(const char *provider_name, uint32_t event_id)
+{
+ if (event_id == EVENTLOG_SESSION_END_ID)
+ return "SESSION_END";
+EOF
+
+for provider in $(list_providers); do
+ {
+ echo " if (strcmp(provider_name, \"${provider}\") == 0)"
+ echo " return ${provider}_eventlog_event_id_to_name(" \
+ "event_id);"
+ } >> "${ABS_MASTER_HEADER}"
+done
+
+cat >> "${ABS_MASTER_HEADER}" << 'EOF'
+ return NULL;
+}
+EOF
+
+cat >> "${ABS_MASTER_HEADER}" << 'EOF'
+
+/* Master keyword name to bitmask lookup (routes to per-provider lookups) */
+static inline uint32_t
+eventlog_keyword_from_string(const char *provider_name, const char *name)
+{
+EOF
+
+for provider in $(list_providers); do
+ {
+ fn="${provider}_eventlog_keyword_from_string"
+ echo " if (strcmp(provider_name, \"${provider}\") == 0)"
+ echo " return ${fn}(name);"
+ } >> "${ABS_MASTER_HEADER}"
+done
+
+cat >> "${ABS_MASTER_HEADER}" << 'EOF'
+ return (0);
+}
+
+#endif /* _EVENTLOG_CONSUMER_H_ */
+EOF

File Metadata

Mime Type
text/plain
Expires
Wed, May 20, 11:28 AM (8 h, 3 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
33267378
Default Alt Text
D56979.diff (423 KB)

Event Timeline