D56979.diff
No OneTemporary
Actions

Size

423 KB

Referenced Files

None

Subscribers

None

D56979.diff
View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	diff --git a/include/eventlog/eventlog_gen.awk b/include/eventlog/eventlog_gen.awk
	new file mode 100644
	--- /dev/null
	+++ b/include/eventlog/eventlog_gen.awk
	@@ -0,0 +1,1792 @@
	+#!/usr/bin/awk -f
	+
	+#
	+# Copyright (c) 2026 Netflix, Inc.
	+#
	+# SPDX-License-Identifier: BSD-2-Clause
	+#
	+# Script to generate event log header file from schema file.
	+#
	+# usage: eventlog_gen.awk <schema.src> -h (generate producer header for kernel)
	+# eventlog_gen.awk <schema.src> -c (generate consumer header for userland)
	+
	+# Print the command-line synopsis on stderr and terminate with status 1.
	+# Diagnostics go to stderr so they are never mistaken for generated output.
	+function usage()
	+{
	+	print "usage: eventlog_gen.awk <schema.src> -h|-c" > "/dev/stderr";
	+	print " -h Generate producer header (for kernel modules generating events)" > "/dev/stderr";
	+	print " -c Generate consumer header (for userland tools consuming events)" > "/dev/stderr";
	+	exit 1;
	+}
	+
	+# Report a fatal error on stderr as "<srcfile>(<line>): <message>" and exit
	+# with status 1.
	+#
	+#   msg   Message text.  When `what` is non-empty, msg is used as a printf
	+#         format and `what` is its single argument; otherwise msg is printed
	+#         verbatim, so a '%' coming from schema text cannot be misread as a
	+#         conversion.
	+#   what  Optional argument for a conversion contained in msg.
	+#
	+# NOTE(review): in POSIX awk, `exit` outside an END action still runs the END
	+# actions -- confirm END does not emit a header after a parse error.
	+function die(msg, what)
	+{
	+	# FNR is awk's built-in record number within the current input file.
	+	printf "%s(%d): ", srcfile, FNR > "/dev/stderr";
	+	if (what == "")
	+		printf "%s\n", msg > "/dev/stderr";
	+	else
	+		printf msg "\n", what > "/dev/stderr";
	+	exit 1;
	+}
	+
	+# Write one line to the generated header file named by the global hfile.
	+#
	+# On the first call with a non-empty hfile, the directory part of hfile is
	+# created with "mkdir -p" (failures are ignored) and dir_created is set so
	+# the setup runs only once.
	+#
	+#   s  Text to write; print appends the newline.
	+#
	+# The parameters after `s` are locals (the usual awk idiom).  Keeping the
	+# scratch variables local means the directory setup cannot clobber globals of
	+# the same name, in particular the loop index `i` used by callers.
	+function printh(s,	dir_path, cwd, parts, nparts, i, cmd)
	+{
	+	if (hfile != "" && !dir_created) {
	+		if (hfile ~ /^\//) {
	+			# Absolute path: drop the last component to get the directory.
	+			dir_path = hfile;
	+			sub(/\/[^\/]*$/, "", dir_path);
	+		} else {
	+			# Relative path: anchor it at the current working directory
	+			# so mkdir never sees a relative name, folding "." and ".."
	+			# components along the way.
	+			"pwd" | getline cwd;
	+			close("pwd");
	+			nparts = split(hfile, parts, "/");
	+			dir_path = cwd;
	+			# The last component is the file name, hence "< nparts".
	+			for (i = 1; i < nparts; i++) {
	+				if (parts[i] == "" || parts[i] == ".")
	+					continue;
	+				if (parts[i] == "..")
	+					sub(/\/[^\/]*$/, "", dir_path);
	+				else
	+					dir_path = dir_path "/" parts[i];
	+			}
	+		}
	+		if (dir_path != "") {
	+			# Double quotes keep paths containing spaces intact.
	+			cmd = "mkdir -p \"" dir_path "\" 2>/dev/null || true";
	+			system(cmd);
	+		}
	+		dir_created = 1;
	+	}
	+	print s > hfile;
	+}
	+
	+# Initialise parser state and decode the command line:
	+#   eventlog_gen.awk <schema.src> -h|-c
	+BEGIN {
	+	# Number of entries stored in each table.
	+	nevents = 0;
	+	nkeywords = 0;
	+	nstructs = 0;
	+	nenums = 0;
	+	nflags = 0;
	+
	+	provider = "";
	+	hfile = "";
	+	opt_h = 0;
	+	opt_c = 0;
	+	mode = ""; # "producer" or "consumer"
	+
	+	# Multi-line definition state: collecting_<x> is 1 while a definition of
	+	# that kind is open, <x>_line accumulates its text.  EVENT state is
	+	# initialised here alongside STRUCT/ENUM/FLAG for consistency.
	+	collecting_struct = 0;
	+	collecting_enum = 0;
	+	collecting_flag = 0;
	+	collecting_event = 0;
	+	struct_line = "";
	+	enum_line = "";
	+	flag_line = "";
	+	event_line = "";
	+
	+	dir_created = 0;
	+
	+	# Process command line
	+	if (ARGC < 2)
	+		usage();
	+
	+	srcfile = ARGV[1];
	+	# Don't remove ARGV[1] - AWK needs it to read the file
	+
	+	for (i = 2; i < ARGC; i++) {
	+		if (ARGV[i] == "-h") {
	+			opt_h = 1;
	+			mode = "producer";
	+			ARGV[i] = ""; # Remove from ARGV so it's not processed as a file
	+		} else if (ARGV[i] == "-c") {
	+			opt_c = 1;
	+			mode = "consumer";
	+			ARGV[i] = ""; # Remove from ARGV so it's not processed as a file
	+		} else {
	+			usage();
	+		}
	+	}
	+
	+	# Exactly one mode must be specified
	+	if (!opt_h && !opt_c)
	+		usage();
	+	if (opt_h && opt_c)
	+		usage();
	+
	+	# hfile stays empty here; it is derived from the provider name once the
	+	# PROVIDER line has been parsed (with a fallback in END).
	+}
	+
	+# PROVIDER <name>: record the provider name (only one is allowed) and derive
	+# the output header path from it.
	+/^[ \t]*PROVIDER/ {
	+	# Trim the indent and squeeze whitespace runs; editing $0 re-splits the
	+	# fields, so NF and $2 below reflect the normalised line.
	+	sub(/^[ \t]+/, "");
	+	gsub(/[ \t]+/, " ");
	+
	+	if (NF < 2)
	+		die("Invalid PROVIDER line: expected PROVIDER <name>");
	+	if (provider != "")
	+		die("PROVIDER already defined");
	+
	+	provider = $2;
	+	provider_lower = tolower(provider);
	+
	+	# Consumer mode writes <provider>_eventlog_consumer.h, producer mode
	+	# <provider>_eventlog.h; the file goes under outdir when one was given.
	+	hfile = provider_lower ((mode == "consumer") ? "_eventlog_consumer.h" : "_eventlog.h");
	+	if (outdir != "")
	+		hfile = outdir "/" hfile;
	+
	+	next;
	+}
	+
	+# While a STRUCT definition is open, indented lines extend it and the first
	+# non-indented line closes it.
	+collecting_struct == 1 {
	+	if ($0 ~ /^[ \t]/) {
	+		# Continuation: append without the indent and move on.
	+		sub(/^[ \t]+/, "");
	+		struct_line = struct_line " " $0;
	+		next;
	+	}
	+
	+	# Non-indented line: the open STRUCT is complete.  There is no `next`
	+	# here on purpose, so the rules below still examine the current line.
	+	collecting_struct = 0;
	+	finalize_struct(struct_line);
	+	struct_line = "";
	+}
	+
	+# STRUCT <name> [<field>:<type> ...]: open a new STRUCT definition.  The text
	+# is accumulated in struct_line and parsed once the definition is closed.
	+/^[ \t]*STRUCT/ {
	+	# No flush of a previously open STRUCT is needed here: whenever
	+	# collecting_struct is 1, the collecting_struct rule placed before this
	+	# one has already either consumed the line as a continuation (and done
	+	# `next`) or finalized the open definition and cleared the flag.
	+	collecting_struct = 1;
	+	struct_line = $0;
	+	sub(/^[ \t]+/, "", struct_line);
	+	next;
	+}
	+
	+# KEYWORD <name> <value>: append one entry to the keywords[] table.
	+/^[ \t]*KEYWORD/ {
	+	# Trim the indent and squeeze whitespace runs; editing $0 re-splits the
	+	# fields used below.
	+	sub(/^[ \t]+/, "");
	+	gsub(/[ \t]+/, " ");
	+
	+	if (NF < 3)
	+		die("Invalid KEYWORD line: expected KEYWORD <name> <value>");
	+
	+	keywords[++nkeywords, "name"] = $2;
	+	keywords[nkeywords, "value"] = $3;
	+
	+	next;
	+}
	+
	+# While an ENUM definition is open, indented lines extend it; the first
	+# non-indented line closes it and the accumulated text is parsed into enums[].
	+collecting_enum == 1 {
	+	if ($0 ~ /^[ \t]/) {
	+		# Continuation: append without the indent and move on.
	+		sub(/^[ \t]+/, "");
	+		enum_line = enum_line " " $0;
	+		next;
	+	}
	+
	+	# Definition complete: take ownership of the text and reset the state.
	+	collecting_enum = 0;
	+	line = enum_line;
	+	enum_line = "";
	+
	+	# Tokenise "ENUM <name> <value>:<name> ..." on whitespace.
	+	gsub(/[ \t]+/, " ", line);
	+	split(line, fields, " ");
	+	if (length(fields) < 2)
	+		die("Invalid ENUM line: expected ENUM <name> [<value1>:<name1> ...]");
	+
	+	nenums++;
	+	enums[nenums, "name"] = fields[2];
	+
	+	# Every remaining token must be exactly one value:name pair.
	+	value_count = 0;
	+	for (i = 3; i <= length(fields); i++) {
	+		if (fields[i] != "") {
	+			split(fields[i], parts, ":");
	+			if (length(parts) != 2)
	+				die("Invalid enum value definition: " fields[i] " (expected value:name)");
	+			value_count++;
	+			enums[nenums, "value", value_count, "num"] = parts[1];
	+			enums[nenums, "value", value_count, "name"] = parts[2];
	+		}
	+	}
	+	enums[nenums, "value_count"] = value_count;
	+
	+	# No `next`: the current line is still offered to the rules below.
	+}
	+
	+# ENUM <name> [<value>:<name> ...]: open a new ENUM definition.  The text is
	+# accumulated in enum_line and parsed once the definition is closed.
	+/^[ \t]*ENUM/ {
	+	# No flush of a previously open ENUM is needed here: whenever
	+	# collecting_enum is 1, the collecting_enum rule placed before this one
	+	# has already either consumed the line as a continuation (and done
	+	# `next`) or parsed the open definition and cleared the flag.
	+	collecting_enum = 1;
	+	enum_line = $0;
	+	sub(/^[ \t]+/, "", enum_line);
	+	next;
	+}
	+
	+# While a FLAG definition is open, indented lines extend it; the first
	+# non-indented line closes it and the accumulated text is parsed into flags[].
	+collecting_flag == 1 {
	+	if ($0 ~ /^[ \t]/) {
	+		# Continuation: append without the indent and move on.
	+		sub(/^[ \t]+/, "");
	+		flag_line = flag_line " " $0;
	+		next;
	+	}
	+
	+	# Definition complete: take ownership of the text and reset the state.
	+	collecting_flag = 0;
	+	line = flag_line;
	+	flag_line = "";
	+
	+	# Tokenise "FLAG <name> <value>:<name> ..." on whitespace.
	+	gsub(/[ \t]+/, " ", line);
	+	split(line, fields, " ");
	+	if (length(fields) < 2)
	+		die("Invalid FLAG line: expected FLAG <name> [<value1>:<name1> ...]");
	+
	+	nflags++;
	+	flags[nflags, "name"] = fields[2];
	+
	+	# Every remaining token must be exactly one value:name pair.
	+	value_count = 0;
	+	for (i = 3; i <= length(fields); i++) {
	+		if (fields[i] != "") {
	+			split(fields[i], parts, ":");
	+			if (length(parts) != 2)
	+				die("Invalid flag value definition: " fields[i] " (expected value:name)");
	+			value_count++;
	+			flags[nflags, "value", value_count, "num"] = parts[1];
	+			flags[nflags, "value", value_count, "name"] = parts[2];
	+		}
	+	}
	+	flags[nflags, "value_count"] = value_count;
	+
	+	# No `next`: the current line is still offered to the rules below.
	+}
	+
	+# FLAG <name> [<value>:<name> ...]: open a new FLAG definition.  The text is
	+# accumulated in flag_line and parsed once the definition is closed.
	+/^[ \t]*FLAG/ {
	+	# No flush of a pending ENUM or FLAG is needed here: the collecting_enum
	+	# and collecting_flag rules both run before this one, and each of them
	+	# either consumes the line as a continuation (and does `next`) or parses
	+	# its open definition and clears its flag.
	+	collecting_flag = 1;
	+	flag_line = $0;
	+	sub(/^[ \t]+/, "", flag_line);
	+	next;
	+}
	+
	+# While an EVENT definition is open, indented lines extend it; the first
	+# non-indented line closes it and the accumulated text is parsed into events[].
	+collecting_event == 1 {
	+	if ($0 ~ /^[ \t]/) {
	+		# Continuation: append without the indent and move on.
	+		sub(/^[ \t]+/, "");
	+		event_line = event_line " " $0;
	+		next;
	+	}
	+
	+	# Definition complete: take ownership of the text and reset the state.
	+	collecting_event = 0;
	+	line = event_line;
	+	event_line = "";
	+
	+	# Tokenise "EVENT <name> <id> <level> <keywords> <struct> <format...>".
	+	gsub(/[ \t]+/, " ", line);
	+	split(line, fields, " ");
	+	if (length(fields) < 7)
	+		die("Invalid EVENT line: expected at least 7 fields (name id level keywords struct format)");
	+
	+	nevents++;
	+	events[nevents, "name"] = fields[2];
	+	events[nevents, "id"] = fields[3];
	+	events[nevents, "level"] = fields[4];
	+	events[nevents, "keywords"] = fields[5];
	+	events[nevents, "struct"] = fields[6];
	+
	+	# The format string is everything from token 7 on, re-joined with single
	+	# spaces, minus one leading and one trailing double quote.
	+	format = "" fields[7];
	+	for (i = 8; i <= length(fields); i++)
	+		format = format " " fields[i];
	+	gsub(/^"/, "", format);
	+	gsub(/"$/, "", format);
	+	events[nevents, "format"] = format;
	+
	+	# No `next`: the current line is still offered to the rules below.
	+}
	+
	+# EVENT <name> <id> <level> <keywords> <struct> <format>: open a new EVENT
	+# definition.  The text is accumulated in event_line and parsed once the
	+# definition is closed.
	+/^[ \t]*EVENT/ {
	+	# No flush of a previously open EVENT is needed here: whenever
	+	# collecting_event is 1, the collecting_event rule placed before this
	+	# one has already either consumed the line as a continuation (and done
	+	# `next`) or parsed the open definition and cleared the flag.
	+	collecting_event = 1;
	+	event_line = $0;
	+	sub(/^[ \t]+/, "", event_line);
	+	next;
	+}
	+
	+# Lines that carry no schema data.  Each rule below only skips the line; they
	+# are kept separate to document which kinds of line are expected.
	+
	+/^[ \t]*\/\// {
	+	# C++ style comment (line starting with //)
	+	next;
	+}
	+
	+/^[ \t]*#/ {
	+	# Shell style comment (line starting with #)
	+	next;
	+}
	+
	+/^[ \t]*\/\*/ {
	+	# Opening line of a C style comment block (line starting with /*)
	+	next;
	+}
	+
	+/^[ \t]*\*/ {
	+	# Continuation or closing line of a C style comment block (line
	+	# starting with *)
	+	next;
	+}
	+
	+/^[ \t]*$/ {
	+	# Empty or whitespace-only line
	+	next;
	+}
	+
	+{
	+	# Unknown line - skip silently (could be part of multi-line comments)
	+	next;
	+}
	+
	+# Print a non-fatal diagnostic on stderr, prefixed with the script name.
	+#
	+#   msg  Message text.  It is printed verbatim through "%s", never used as
	+#        the printf format, so messages that mention conversions such as
	+#        "%u" are shown as written instead of consuming missing arguments.
	+function warn(msg)
	+{
	+	printf "eventlog_gen.awk: %s\n", msg > "/dev/stderr";
	+}
	+
	+# Parse one accumulated STRUCT definition into the structs[] tables.
	+# Called from the grammar entry points and from END for trailing STRUCTs.
	+#
	+#   struct_input  Definition text "STRUCT <name> [<field> ...]" with any
	+#                 continuation lines already joined.
	+#
	+# Each <field> token is one of:
	+#   name:type                  fixed-size field
	+#   name:type:annotation       annotation is hex, ntohs, enum_<name> or
	+#                              flag_<name>
	+#   name:char[N]               fixed-size char array ("char[N]" is the type)
	+#   name:type[countfield:max]  variable-length trailing array (VARLEN)
	+#
	+# Increments nstructs and fills structs[nstructs, ...].  Calls die() on any
	+# malformed definition, including a STRUCT with more than one VARLEN field.
	+# All parameters after struct_input are locals.
	+function finalize_struct(struct_input, line, fields, parts, i, j, annotation, ftype, count_field, count_field_idx, max_str, bracket_pos, colon_pos, head, tail, bparts, fname, field_count)
	+{
	+	line = struct_input;
	+	gsub(/[ \t]+/, " ", line);
	+	split(line, fields, " ");
	+
	+	if (length(fields) < 2) {
	+		die("Invalid STRUCT line: expected STRUCT <name> [<field1>:<type1> ...]");
	+	}
	+
	+	nstructs++;
	+	structs[nstructs, "name"] = fields[2];
	+	structs[nstructs, "has_varlen"] = 0;
	+
	+	field_count = 0;
	+	for (i = 3; i <= length(fields); i++) {
	+		if (fields[i] == "")
	+			continue;
	+
	+		# A '[' marks either a fixed char array (name:char[N]) or a
	+		# VARLEN field (name:type[countfield:max]); the two are told
	+		# apart by whether the bracket contents include a ':'.
	+		bracket_pos = index(fields[i], "[");
	+		if (bracket_pos > 0) {
	+			if (substr(fields[i], length(fields[i]), 1) != "]") {
	+				die("Invalid bracketed field: " fields[i] " (missing ']')");
	+			}
	+			# head is "name:type", tail is the text between '[' and ']'.
	+			head = substr(fields[i], 1, bracket_pos - 1);
	+			tail = substr(fields[i], bracket_pos + 1, length(fields[i]) - bracket_pos - 1);
	+			if (index(tail, ":") != 0) {
	+				# VARLEN field
	+				colon_pos = index(head, ":");
	+				if (colon_pos == 0) {
	+					die("Invalid varlen field: " fields[i] " (expected name:type[countfield:max])");
	+				}
	+				fname = substr(head, 1, colon_pos - 1);
	+				ftype = substr(head, colon_pos + 1);
	+				split(tail, bparts, ":");
	+				if (length(bparts) != 2) {
	+					die("Invalid varlen field: " fields[i] " (expected [countfield:max])");
	+				}
	+				count_field = bparts[1];
	+				max_str = bparts[2];
	+				if (max_str !~ /^[0-9]+$/ || (max_str + 0) == 0) {
	+					die("Invalid varlen max: " max_str " (must be positive integer)");
	+				}
	+				# Disallow char[] element types on varlen fields.
	+				if (ftype == "char" || match(ftype, /^char\[[0-9]+\]$/)) {
	+					die("Varlen field " fname " may not use char/char[] element type");
	+				}
	+				# Only one trailing array is supported: the struct
	+				# records a single varlen_field_idx, so a second
	+				# VARLEN field would leave the first one untracked.
	+				if (structs[nstructs, "has_varlen"]) {
	+					die("STRUCT " structs[nstructs, "name"] " declares more than one varlen field (" fname ")");
	+				}
	+				field_count++;
	+				structs[nstructs, "field", field_count, "name"] = fname;
	+				structs[nstructs, "field", field_count, "type"] = ftype;
	+				structs[nstructs, "field", field_count, "is_varlen"] = 1;
	+				structs[nstructs, "field", field_count, "varlen_count"] = count_field;
	+				structs[nstructs, "field", field_count, "varlen_max"] = (max_str + 0);
	+				structs[nstructs, "has_varlen"] = 1;
	+				structs[nstructs, "varlen_field_idx"] = field_count;
	+				# "Must be the last field" is checked after the loop.
	+				continue;
	+			}
	+			# No ':' inside the brackets: fixed char[N].  Fall through
	+			# to the fixed-field parsing below, which keeps the whole
	+			# "type[N]" spelling as the type.
	+		}
	+
	+		# Fixed field: split name:type[:annotation]
	+		split(fields[i], parts, ":");
	+		if (length(parts) < 2 || length(parts) > 3) {
	+			die("Invalid field definition: " fields[i] " (expected field:type[:enum_or_flag_or_hex])");
	+		}
	+		field_count++;
	+		structs[nstructs, "field", field_count, "name"] = parts[1];
	+		structs[nstructs, "field", field_count, "type"] = parts[2];
	+		if (length(parts) == 3) {
	+			annotation = parts[3];
	+			if (annotation == "hex") {
	+				structs[nstructs, "field", field_count, "hex_format"] = 1;
	+			} else if (annotation == "ntohs") {
	+				structs[nstructs, "field", field_count, "ntohs"] = 1;
	+			} else if (substr(annotation, 1, 5) == "enum_") {
	+				structs[nstructs, "field", field_count, "enum_type"] = substr(annotation, 6);
	+			} else if (substr(annotation, 1, 5) == "flag_") {
	+				structs[nstructs, "field", field_count, "flag_type"] = substr(annotation, 6);
	+			} else {
	+				die("Invalid annotation: " annotation " (expected hex, ntohs, enum_<name>, or flag_<name>)");
	+			}
	+		}
	+	}
	+	structs[nstructs, "field_count"] = field_count;
	+
	+	# Validate varlen placement/references.
	+	if (structs[nstructs, "has_varlen"]) {
	+		if (structs[nstructs, "varlen_field_idx"] != field_count) {
	+			die("Varlen field in STRUCT " structs[nstructs, "name"] " must be the last field");
	+		}
	+		# The count field must be one of the fields preceding the array.
	+		count_field = structs[nstructs, "field", field_count, "varlen_count"];
	+		count_field_idx = 0;
	+		for (j = 1; j < field_count; j++) {
	+			if (structs[nstructs, "field", j, "name"] == count_field) {
	+				count_field_idx = j;
	+				break;
	+			}
	+		}
	+		if (count_field_idx == 0) {
	+			die("Varlen count field '" count_field "' not found in STRUCT " structs[nstructs, "name"]);
	+		}
	+		# The count field must be an unsigned integral type.
	+		ftype = structs[nstructs, "field", count_field_idx, "type"];
	+		if (ftype != "uint8_t" && ftype != "uint16_t" && ftype != "uint32_t" &&
	+		    ftype != "uint64_t" && ftype != "u_char" && ftype != "u_short" &&
	+		    ftype != "u_int" && ftype != "u_long" && ftype != "size_t") {
	+			die("Varlen count field '" count_field "' must be an unsigned scalar (got " ftype ")");
	+		}
	+		structs[nstructs, "varlen_count_idx"] = count_field_idx;
	+	}
	+}
	+
	+# Return the size in bytes of a schema field type, assuming a 64-bit platform.
	+#
	+#   type  Type spelling as written in the schema (e.g. "uint32_t", "char[64]").
	+#
	+# Unknown types produce a warning and are sized as 4 bytes.
	+function get_type_size(type)
	+{
	+	if (type == "uint8_t" || type == "int8_t" || type == "char" || type == "u_char")
	+		return 1;
	+	if (type == "uint16_t" || type == "int16_t" || type == "short" || type == "u_short")
	+		return 2;
	+	if (type == "uint32_t" || type == "int32_t" || type == "int" || type == "u_int" || type == "lwpid_t")
	+		return 4;
	+	if (type == "uint64_t" || type == "int64_t" || type == "long" || type == "u_long" || type == "size_t")
	+		return 8;
	+	if (type == "uintptr_t" || type == "intptr_t")
	+		return 8;
	+	# Pointer types: a bare "void" is not a valid field type, the pointer
	+	# spellings are what is meant here.
	+	if (type == "void*" || type == "void *")
	+		return 8; # Pointer size on 64-bit platform
	+	if (type == "in_addr_t" || type == "struct in_addr")
	+		return 4; # IPv4 address is 4 bytes
	+	if (type == "in6_addr_t" || type == "struct in6_addr")
	+		return 16; # IPv6 address is 16 bytes
	+	# char[N] - fixed-size char array (e.g., char[64]): the size is N.
	+	if (match(type, /^char\[[0-9]+\]$/)) {
	+		sub(/^char\[/, "", type);
	+		sub(/\]$/, "", type);
	+		return type + 0;
	+	}
	+	# Default to 4 bytes for unknown types
	+	warn("Unknown type size for: " type ", assuming 4 bytes");
	+	return 4;
	+}
	+
	+# Return the printf conversion used to display a field.
	+#
	+#   field_type  Type spelling as written in the schema.
	+#   enum_type   Non-empty when the field carries an enum_<name> annotation.
	+#   flag_type   Non-empty when the field carries a flag_<name> annotation.
	+#   hex_format  True when the field carries the hex annotation.
	+#
	+# Unknown types produce a warning and fall back to "%u".
	+function get_printf_format(field_type, enum_type, flag_type, hex_format)
	+{
	+	if (enum_type != "" || flag_type != "")
	+		return "%s"; # Enum/flag fields are converted to strings
	+	if (field_type == "in_addr_t" || field_type == "struct in_addr")
	+		return "%s"; # IP addresses are converted to strings
	+	if (field_type == "in6_addr_t" || field_type == "struct in6_addr")
	+		return "%s"; # IPv6 addresses are converted to strings
	+	# Pointer types: a bare "void" is not a valid field type, the pointer
	+	# spellings are what is meant here.
	+	if (field_type == "void*" || field_type == "void *")
	+		return "%p"; # Pointers
	+	# char[N] - fixed-size char array, treat as string
	+	if (match(field_type, /^char\[[0-9]+\]$/))
	+		return "%s"; # Char arrays displayed as strings
	+	# The hex annotation only affects the unsigned types listed here; any
	+	# other type falls through to its decimal conversion below.
	+	if (hex_format) {
	+		if (field_type == "uint8_t" || field_type == "u_char")
	+			return "%x";
	+		if (field_type == "uint16_t" || field_type == "u_short")
	+			return "%x";
	+		if (field_type == "uint32_t" || field_type == "u_int" || field_type == "lwpid_t")
	+			return "%x";
	+		if (field_type == "uint64_t" || field_type == "u_long" || field_type == "size_t")
	+			return "%lx";
	+	}
	+	if (field_type == "uint8_t" || field_type == "u_char")
	+		return "%u";
	+	if (field_type == "int8_t" || field_type == "char")
	+		return "%d";
	+	if (field_type == "uint16_t" || field_type == "u_short")
	+		return "%u";
	+	if (field_type == "int16_t" || field_type == "short")
	+		return "%d";
	+	if (field_type == "uint32_t" || field_type == "u_int" || field_type == "lwpid_t")
	+		return "%u";
	+	if (field_type == "int32_t" || field_type == "int")
	+		return "%d";
	+	if (field_type == "uint64_t" || field_type == "u_long" || field_type == "size_t")
	+		return "%lu";
	+	if (field_type == "int64_t" || field_type == "long")
	+		return "%ld";
	+	# Default to %u for unknown types
	+	warn("Unknown printf format for type: " field_type ", using %u");
	+	return "%u";
	+}
	+
	+END {
	+ # Process any remaining collected STRUCT
	+ if (collecting_struct == 1 && struct_line != "") {
	+ finalize_struct(struct_line);
	+ struct_line = "";
	+ collecting_struct = 0;
	+ }
	+
	+ # Process any remaining collected EVENT
	+ if (collecting_event == 1 && event_line != "") {
	+ line = event_line;
	+ gsub(/[ \t]+/, " ", line);
	+ split(line, fields, " ");
	+
	+ if (length(fields) >= 7) {
	+ nevents++;
	+ events[nevents, "name"] = fields[2];
	+ events[nevents, "id"] = fields[3];
	+ events[nevents, "level"] = fields[4];
	+ events[nevents, "keywords"] = fields[5];
	+ events[nevents, "struct"] = fields[6];
	+ format = "";
	+ for (i = 7; i <= length(fields); i++) {
	+ if (i > 7)
	+ format = format " ";
	+ format = format fields[i];
	+ }
	+ gsub(/^"/, "", format);
	+ gsub(/"$/, "", format);
	+ events[nevents, "format"] = format;
	+ }
	+ }
	+
	+ # Process any remaining collected ENUM
	+ if (collecting_enum == 1 && enum_line != "") {
	+ line = enum_line;
	+ gsub(/[ \t]+/, " ", line);
	+ split(line, fields, " ");
	+
	+ if (length(fields) >= 2) {
	+ nenums++;
	+ enums[nenums, "name"] = fields[2];
	+ value_count = 0;
	+ for (i = 3; i <= length(fields); i++) {
	+ if (fields[i] == "")
	+ continue;
	+ split(fields[i], parts, ":");
	+ if (length(parts) == 2) {
	+ value_count++;
	+ enums[nenums, "value", value_count, "num"] = parts[1];
	+ enums[nenums, "value", value_count, "name"] = parts[2];
	+ }
	+ }
	+ enums[nenums, "value_count"] = value_count;
	+ }
	+ }
	+
	+ # Process any remaining collected FLAG
	+ if (collecting_flag == 1 && flag_line != "") {
	+ line = flag_line;
	+ gsub(/[ \t]+/, " ", line);
	+ split(line, fields, " ");
	+
	+ if (length(fields) >= 2) {
	+ nflags++;
	+ flags[nflags, "name"] = fields[2];
	+ value_count = 0;
	+ for (i = 3; i <= length(fields); i++) {
	+ if (fields[i] == "")
	+ continue;
	+ split(fields[i], parts, ":");
	+ if (length(parts) == 2) {
	+ value_count++;
	+ flags[nflags, "value", value_count, "num"] = parts[1];
	+ flags[nflags, "value", value_count, "name"] = parts[2];
	+ }
	+ }
	+ flags[nflags, "value_count"] = value_count;
	+ }
	+ }
	+
	+ if (provider == "") {
	+ die("PROVIDER must be defined at the beginning of the schema file");
	+ }
	+
	+ if (nevents == 0) {
	+ die("No events found in schema file");
	+ }
	+
	+ # Add KEYWORD SESSION for reserved events (0x80000000 = EVENTLOG_KEYWORD_SESSION)
	+ for (i = 1; i <= nkeywords; i++) {
	+ if (keywords[i, "name"] == "SESSION")
	+ break;
	+ }
	+ if (i > nkeywords) {
	+ nkeywords++;
	+ keywords[nkeywords, "name"] = "SESSION";
	+ keywords[nkeywords, "value"] = "0x80000000";
	+ }
	+ # Add struct for SESSION_CREATE payload if schema does not define it
	+ has_session_create = 0;
	+ for (si = 1; si <= nstructs; si++) {
	+ if (structs[si, "name"] == "SESSION_CREATE") {
	+ has_session_create = 1;
	+ break;
	+ }
	+ }
	+ if (!has_session_create) {
	+ nstructs++;
	+ structs[nstructs, "name"] = "SESSION_CREATE";
	+ structs[nstructs, "field", 1, "name"] = "_unused";
	+ structs[nstructs, "field", 1, "type"] = "uint8_t";
	+ structs[nstructs, "field_count"] = 1;
	+ }
	+ # SESSION_END has no payload - uses NONE struct
	+ # Add reserved events with fixed IDs (UINT32_MAX-1, UINT32_MAX) if not in schema
	+ has_session_create_evt = 0;
	+ has_session_end_evt = 0;
	+ for (ei = 1; ei <= nevents; ei++) {
	+ if (events[ei, "name"] == "SESSION_CREATE") has_session_create_evt = 1;
	+ if (events[ei, "name"] == "SESSION_END") has_session_end_evt = 1;
	+ }
	+ if (!has_session_create_evt) {
	+ nevents++;
	+ events[nevents, "name"] = "SESSION_CREATE";
	+ events[nevents, "id"] = 4294967294; # UINT32_MAX-1
	+ events[nevents, "level"] = "INFO";
	+ events[nevents, "keywords"] = "SESSION";
	+ events[nevents, "struct"] = "SESSION_CREATE";
	+ events[nevents, "format"] = "Session created";
	+ }
	+ if (!has_session_end_evt) {
	+ nevents++;
	+ events[nevents, "name"] = "SESSION_END";
	+ events[nevents, "id"] = 4294967295; # UINT32_MAX
	+ events[nevents, "level"] = "INFO";
	+ events[nevents, "keywords"] = "SESSION";
	+ events[nevents, "struct"] = "NONE";
	+ events[nevents, "format"] = "Session ended";
	+ }
	+
	+ if (hfile == "") {
	+ provider_lower = tolower(provider);
	+ # If outdir was provided via -v outdir=..., use it
	+ if (outdir != "") {
	+ if (mode == "consumer") {
	+ hfile = outdir "/" provider_lower "_eventlog_consumer.h";
	+ } else {
	+ hfile = outdir "/" provider_lower "_eventlog.h";
	+ }
	+ } else {
	+ if (mode == "consumer") {
	+ hfile = provider_lower "_eventlog_consumer.h";
	+ } else {
	+ hfile = provider_lower "_eventlog.h";
	+ }
	+ }
	+ }
	+
	+ # Generate header file header
	+ generated = "@" "generated";
	+ printh("/*");
	+ printh(" * THIS FILE AUTOMATICALLY GENERATED. DO NOT EDIT.");
	+ printh(" *");
	+ printh(" * Generated from " srcfile);
	+ printh(" * by eventlog_gen.awk");
	+ printh(" */");
	+ printh("");
	+ printh("#ifndef _" toupper(provider) "_EVENTLOG_H_");
	+ printh("#define _" toupper(provider) "_EVENTLOG_H_");
	+ printh("");
	+ printh("#include <sys/eventlog.h>");
	+ # Check if any struct uses in_addr, in6_addr, char[N], or declares a
	+ # trailing variable-length array.
	+ needs_inet = 0;
	+ needs_inet6 = 0;
	+ needs_string = 0;
	+ needs_iovec = 0;
	+ for (i = 1; i <= nstructs; i++) {
	+ if (structs[i, "has_varlen"] == 1)
	+ needs_iovec = 1;
	+ field_count = structs[i, "field_count"];
	+ for (j = 1; j <= field_count; j++) {
	+ field_type = structs[i, "field", j, "type"];
	+ if (field_type == "in_addr_t" \|\| field_type == "struct in_addr")
	+ needs_inet = 1;
	+ if (field_type == "in6_addr_t" \|\| field_type == "struct in6_addr")
	+ needs_inet6 = 1;
	+ if (match(field_type, /^char\[[0-9]+\]$/))
	+ needs_string = 1;
	+ }
	+ }
	+ if (needs_inet \|\| needs_inet6)
	+ printh("#include <netinet/in.h>");
	+ # libkern (strncpy, bzero) only for kernel producer; consumer uses string.h
	+ if (needs_string && mode == "producer")
	+ printh("#include <sys/libkern.h>");
	+ # struct iovec for the gather write path (producer-only).
	+ if (needs_iovec && mode == "producer")
	+ printh("#include <sys/uio.h>");
	+ printh("");
	+
	+ # Generate provider instance and macros at the top
	+ provider_upper = toupper(provider);
	+ provider_lower = tolower(provider);
	+
	+ # Calculate maximum event size by finding the largest struct.
	+ # For varlen structs, the max includes the tail: sizeof(fixed head) + max_elements * sizeof(element).
	+ max_size = 0;
	+ for (i = 1; i <= nevents; i++) {
	+ struct_name = events[i, "struct"];
	+ if (struct_name == "NONE")
	+ continue;
	+ struct_idx = 0;
	+ for (j = 1; j <= nstructs; j++) {
	+ if (structs[j, "name"] == struct_name) {
	+ struct_idx = j;
	+ break;
	+ }
	+ }
	+ if (struct_idx == 0) {
	+ die("Struct " struct_name " not found for event " events[i, "name"]);
	+ }
	+ field_count = structs[struct_idx, "field_count"];
	+ event_size = 0;
	+ for (j = 1; j <= field_count; j++) {
	+ field_type = structs[struct_idx, "field", j, "type"];
	+ if (structs[struct_idx, "field", j, "is_varlen"] == 1) {
	+ event_size += get_type_size(field_type) * structs[struct_idx, "field", j, "varlen_max"];
	+ } else {
	+ event_size += get_type_size(field_type);
	+ }
	+ }
	+ if (event_size > max_size) {
	+ max_size = event_size;
	+ }
	+ }
	+
	+
	+ # Generate struct type definitions.
	+ # For varlen structs the trailing array is NOT declared as a C member --
	+ # callers access it through the generated accessor helper. We emit a
	+ # comment documenting the wire layout.
	+ printh("/* Event data structures */");
	+ for (i = 1; i <= nstructs; i++) {
	+ struct_name = structs[i, "name"];
	+ field_count = structs[i, "field_count"];
	+ # For structs with a VARLEN trailing array, force struct alignment
	+ # (and therefore sizeof) to be a multiple of alignof(elem_type) so
	+ # the trailing array starts on an aligned offset. Without this, a
	+ # head that ends at e.g. offset 12 followed by a uint64[] trailer
	+ # would produce an unaligned cast (-Wcast-align) and, worse, a
	+ # real unaligned access on strict-alignment architectures.
	+ struct_align = 0;
	+ if (structs[i, "has_varlen"] == 1) {
	+ vidx = structs[i, "varlen_field_idx"];
	+ struct_align = get_type_size(structs[i, "field", vidx, "type"]);
	+ }
	+ if (struct_align > 0) {
	+ printh("struct __aligned(" struct_align ") " \
	+ provider_lower "_eventlog_" tolower(struct_name) " {");
	+ } else {
	+ printh("struct " provider_lower "_eventlog_" tolower(struct_name) " {");
	+ }
	+ for (j = 1; j <= field_count; j++) {
	+ field_name = structs[i, "field", j, "name"];
	+ field_type = structs[i, "field", j, "type"];
	+ if (structs[i, "field", j, "is_varlen"] == 1) {
	+ # Documentation only - the trailing array lives in the wire payload,
	+ # not in this C struct. Use the accessor helper to read it.
	+ printh("\t/* Followed on the wire by " field_type " " field_name \
	+ "[" structs[i, "field", j, "varlen_count"] "]; " \
	+ "max " structs[i, "field", j, "varlen_max"] " elements */");
	+ continue;
	+ }
	+ # Map special types to their C equivalents
	+ if (field_type == "in_addr_t")
	+ field_type = "struct in_addr";
	+ else if (field_type == "in6_addr_t")
	+ field_type = "struct in6_addr";
	+ # char[N] -> char field_name[N];
	+ if (match(field_type, /^char\[([0-9]+)\]$/)) {
	+ array_size = substr(field_type, RSTART + 5, RLENGTH - 6);
	+ printh("\tchar\t" field_name "[" array_size "];");
	+ } else {
	+ printh("\t" field_type "\t" field_name ";");
	+ }
	+ }
	+ printh("};");
	+ # Emit a MAX constant + accessor helper for any varlen field.
	+ if (structs[i, "has_varlen"] == 1) {
	+ vidx = structs[i, "varlen_field_idx"];
	+ vname = structs[i, "field", vidx, "name"];
	+ vtype = structs[i, "field", vidx, "type"];
	+ vcount = structs[i, "field", vidx, "varlen_count"];
	+ vmax_define = toupper(provider) "_EVENTLOG_" toupper(struct_name) \
	+ "_" toupper(vname) "_MAX";
	+ printh("#define\t" vmax_define "\t" \
	+ structs[i, "field", vidx, "varlen_max"]);
	+ printh("");
	+ printh("/*");
	+ printh(" * Read the trailing " vname "[] array from a " struct_name " wire payload.");
	+ printh(" * Returns a pointer to the first element, or NULL if the payload is too");
	+ printh(" * small to hold the claimed count. Callers should use evt->" vcount);
	+ printh(" * (already bounded to " vmax_define ") as the element count.");
	+ printh(" */");
	+ printh("static inline const " vtype " *");
	+ printh(provider_lower "_eventlog_" tolower(struct_name) "_" tolower(vname) \
	+ "(const struct " provider_lower "_eventlog_" tolower(struct_name) " *evt, size_t payload_size)");
	+ printh("{");
	+ printh("\tsize_t __head = sizeof(*evt);");
	+ printh("\tsize_t __n = (size_t)evt->" vcount ";");
	+ printh("\tif (__n > " vmax_define ")");
	+ printh("\t\treturn NULL;");
	+ printh("\tif (payload_size < __head + __n * sizeof(" vtype "))");
	+ printh("\t\treturn NULL;");
	+ # Cast via const void * to silence -Wcast-align. The struct
	+ # definition above carries __aligned(sizeof(" vtype ")) so the
	+ # trailing array is guaranteed to start on an aligned offset.
	+ printh("\treturn (const " vtype " *)(const void *)((const char *)evt + __head);");
	+ printh("}");
	+ }
	+ printh("");
	+ }
	+
	+ # Generate keyword flag definitions
	+ printh("/* Event keyword flags */");
	+ # Keywords defined in schema
	+ next_bit = 0x0001; # Start from first bit
	+ for (i = 1; i <= nkeywords; i++) {
	+ # Convert value to hex if it's numeric, otherwise use as-is
	+ value = keywords[i, "value"];
	+ if (value ~ /^0x[0-9a-fA-F]+$/) {
	+ # Already hex - use as-is
	+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t" value);
	+ } else if (value ~ /^[0-9]+$/) {
	+ # Decimal - if it's a small number (like 1), treat as relative bit position
	+ # Otherwise use the value directly converted to hex
	+ num_value = value + 0;
	+ if (num_value < 16) {
	+ # Small number - treat as relative bit position
	+ # 1 = first bit (0x0001), 2 = second bit (0x0002), etc.
	+ # Calculate: 2^(num_value-1)
	+ bit_shift = num_value - 1;
	+ bit_value = 2 ^ bit_shift;
	+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t0x" sprintf("%04x", bit_value));
	+ } else {
	+ # Large number - use directly as hex
	+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t0x" sprintf("%04x", num_value));
	+ }
	+ } else {
	+ # Use as-is (might be a constant)
	+ printh("#define\t" toupper(provider) "_EVENTLOG_KEYWORD_" keywords[i, "name"] "\t" value);
	+ }
	+ }
	+ printh("");
	+
	+ # Generate enum constant definitions (needed for both modes)
	+ if (nenums > 0) {
	+ printh("/* Enum constant definitions */");
	+ for (i = 1; i <= nenums; i++) {
	+ enum_name = enums[i, "name"];
	+ value_count = enums[i, "value_count"];
	+ for (j = 1; j <= value_count; j++) {
	+ value_num = enums[i, "value", j, "num"];
	+ value_name = enums[i, "value", j, "name"];
	+ printh("#define\t" toupper(provider) "_EVENTLOG_" toupper(enum_name) "_" toupper(value_name) "\t" value_num);
	+ }
	+ }
	+ printh("");
	+ }
	+
	+ # Generate flag constant definitions (needed for both modes)
	+ if (nflags > 0) {
	+ printh("/* Flag constant definitions */");
	+ for (i = 1; i <= nflags; i++) {
	+ flag_name = flags[i, "name"];
	+ value_count = flags[i, "value_count"];
	+ for (j = 1; j <= value_count; j++) {
	+ value_num = flags[i, "value", j, "num"];
	+ value_name = flags[i, "value", j, "name"];
	+ printh("#define\t" toupper(provider) "_EVENTLOG_FLAG_" toupper(value_name) "\t" value_num);
	+ }
	+ }
	+ printh("");
	+ }
	+
	+ # Generate event ID constants (both modes)
	+ printh("/* Event ID constants */");
	+ for (i = 1; i <= nevents; i++) {
	+ printh("#define\t" toupper(provider) "_EVENTLOG_" events[i, "name"] "_ID\t" events[i, "id"]);
	+ }
	+ printh("");
	+
	+ # Generate event definitions (producer mode only)
	+ if (mode == "producer") {
	+ printh("/* Events */");
	+ printh("");
	+
	+ for (i = 1; i <= nevents; i++) {
	+ # Convert level to enum value
	+ level_enum = "EVENTLOG_LEVEL_INFO";
	+ if (events[i, "level"] == "ERROR")
	+ level_enum = "EVENTLOG_LEVEL_ERROR";
	+ else if (events[i, "level"] == "WARN")
	+ level_enum = "EVENTLOG_LEVEL_WARN";
	+ else if (events[i, "level"] == "VERBOSE")
	+ level_enum = "EVENTLOG_LEVEL_VERBOSE";
	+ else if (events[i, "level"] == "TRACE")
	+ level_enum = "EVENTLOG_LEVEL_TRACE";
	+
	+ # Parse keywords into keyword_flags
	+ keywords_str = events[i, "keywords"];
	+ keyword_flags = "";
	+ # Only use keywords explicitly defined in schema
	+ for (j = 1; j <= nkeywords; j++) {
	+ keyword_name = keywords[j, "name"];
	+ if (index(keywords_str, keyword_name) > 0)
	+ keyword_flags = keyword_flags " \| " toupper(provider) "_EVENTLOG_KEYWORD_" keyword_name;
	+ }
	+
	+ # Remove leading " \| "
	+ if (keyword_flags != "") {
	+ # Remove leading space and " \| " (substring starting at position 4)
	+ keyword_flags = substr(keyword_flags, 4);
	+ } else {
	+ keyword_flags = "0";
	+ }
	+
	+ # Generate defines for event
	+ event_name = events[i, "name"];
	+ struct_name = events[i, "struct"];
	+ provider_upper = toupper(provider);
	+ provider_lower = tolower(provider);
	+
	+ # Generate comment block for this event
	+ printh("/*");
	+ printh(" * " provider_upper " " event_name " Event");
	+ printh(" */");
	+ printh("");
	+
	+ # Generate enabled macro - uses session effective_level/effective_keywords
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_ENABLED(__session) \\");
	+ printh("\t((__session != NULL) && \\");
	+ printh("\t ((__session)->effective_level >= " level_enum ") && \\");
	+ printh("\t (((__session)->effective_keywords & (" keyword_flags ")) != 0))");
	+ printh("");
	+
	+ if (struct_name == "NONE") {
	+ # No-payload event: macros take only __session
	+ printh("/* struct eventlog_session *session */");
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(__session) \\");
	+ printh("\tdo { \\");
	+ printh("\t\teventlog_event_write(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), NULL, 0); \\");
	+ printh("\t} while (0)");
	+ printh("");
	+ printh("/* struct eventlog_session *session */");
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG(__session) \\");
	+ printh("\tdo { \\");
	+ printh("\t\tif (" provider_upper "_EVENTLOG_" event_name "_ENABLED(__session)) { \\");
	+ printh("\t\t\t" provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(__session); \\");
	+ printh("\t\t} \\");
	+ printh("\t} while (0)");
	+ printh("");
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" events[i, "format"] "\"");
	+ printh("");
	+ } else {
	+ # Find the struct definition for this event
	+ struct_idx = 0;
	+ for (j = 1; j <= nstructs; j++) {
	+ if (structs[j, "name"] == struct_name) {
	+ struct_idx = j;
	+ break;
	+ }
	+ }
	+
	+ if (struct_idx == 0) {
	+ die("Struct " struct_name " not found for event " event_name);
	+ }
	+
	+ # Generate _LOG_ALWAYS and _LOG macros
	+ field_count = structs[struct_idx, "field_count"];
	+ has_varlen = (structs[struct_idx, "has_varlen"] == 1);
	+ varlen_idx = has_varlen ? structs[struct_idx, "varlen_field_idx"] : 0;
	+
	+ # For varlen events, the last parameter is a pointer to a user-supplied
	+ # source array of elements; the count is taken from the count_field
	+ # parameter already in the signature. We do not append an extra count
	+ # parameter -- the count is whichever scalar field the schema named.
	+
	+ # Build parameter list (without types) and comment list (with types)
	+ # Use __ prefix for all parameters to avoid collisions with struct field names
	+ param_list = "__session";
	+ # Comment uses non-prefixed names for readability
	+ param_comment = "struct eventlog_session *session";
	+ struct_type_name = provider_lower "_eventlog_" tolower(struct_name);
	+
	+ for (j = 1; j <= field_count; j++) {
	+ field_name = structs[struct_idx, "field", j, "name"];
	+ field_type = structs[struct_idx, "field", j, "type"];
	+
	+ # Add to parameter list with __ prefix
	+ param_list = param_list ", __" field_name;
	+ # Comment uses non-prefixed names for readability
	+ if (structs[struct_idx, "field", j, "is_varlen"] == 1) {
	+ param_comment = param_comment ", const " field_type " *" field_name;
	+ } else {
	+ param_comment = param_comment ", " field_type " " field_name;
	+ }
	+ }
	+
	+ # Generate _LOG_ALWAYS macro (does the actual logging)
	+ # Use __ prefixed parameter names to avoid collisions with struct field names
	+ # Check if any field is char[N] (requires strncpy, not direct assignment)
	+ has_char_array = 0;
	+ for (j = 1; j <= field_count; j++) {
	+ field_type = structs[struct_idx, "field", j, "type"];
	+ if (match(field_type, /^char\[[0-9]+\]$/)) {
	+ has_char_array = 1;
	+ break;
	+ }
	+ }
	+
	+ printh("/* " param_comment " */");
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(" param_list ") \\");
	+ printh("\tdo { \\");
	+ if (has_varlen) {
	+ # Build a 2-element iovec: [0] = the fixed head on the
	+ # stack, [1] = the caller's source array. The framework's
	+ # gather write path copies directly into the subscriber
	+ # ring buffer, avoiding the pre-copy and the worst-case
	+ # stack footprint of a composite struct.
	+ varlen_field_name = structs[struct_idx, "field", varlen_idx, "name"];
	+ varlen_elem_type = structs[struct_idx, "field", varlen_idx, "type"];
	+ varlen_count_name = structs[struct_idx, "field", varlen_idx, "varlen_count"];
	+ varlen_max_define = provider_upper "_EVENTLOG_" toupper(struct_name) \
	+ "_" toupper(varlen_field_name) "_MAX";
	+
	+ # All declarations first so the expansion is valid in a
	+ # nested block in strict C modes.
	+ printh("\t\tstruct " struct_type_name " __head; \\");
	+ printh("\t\tstruct iovec __iov[2]; \\");
	+ printh("\t\tsize_t __n = (size_t)(__" varlen_count_name "); \\");
	+ printh("\t\tif (__n > " varlen_max_define ") \\");
	+ printh("\t\t\t__n = " varlen_max_define "; \\");
	+ if (has_char_array) {
	+ printh("\t\tbzero(&__head, sizeof(__head)); \\");
	+ }
	+ # Assign head fields (all fields except the varlen one).
	+ for (j = 1; j < field_count; j++) {
	+ field_name = structs[struct_idx, "field", j, "name"];
	+ field_type = structs[struct_idx, "field", j, "type"];
	+ if (match(field_type, /^char\[[0-9]+\]$/)) {
	+ printh("\t\tstrncpy(__head." field_name ", (__" field_name ") ? (__" field_name ") : \"\", sizeof(__head." field_name ") - 1); \\");
	+ printh("\t\t__head." field_name "[sizeof(__head." field_name ") - 1] = '\\0'; \\");
	+ } else if (field_name == varlen_count_name) {
	+ # Overwrite the count with the clamped value so
	+ # the wire layout matches what we actually pack.
	+ printh("\t\t__head." field_name " = __n; \\");
	+ } else {
	+ printh("\t\t__head." field_name " = (__" field_name "); \\");
	+ }
	+ }
	+ printh("\t\t__iov[0].iov_base = (void *)&__head; \\");
	+ printh("\t\t__iov[0].iov_len = sizeof(__head); \\");
	+ printh("\t\t__iov[1].iov_base = __DECONST(void *, (__" varlen_field_name ")); \\");
	+ printh("\t\t__iov[1].iov_len = __n * sizeof(" varlen_elem_type "); \\");
	+ printh("\t\teventlog_event_write_gather(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), __iov, 2); \\");
	+ } else if (has_char_array) {
	+ printh("\t\tstruct " struct_type_name " __evt; \\");
	+ printh("\t\tbzero(&__evt, sizeof(__evt)); \\");
	+ for (j = 1; j <= field_count; j++) {
	+ field_name = structs[struct_idx, "field", j, "name"];
	+ field_type = structs[struct_idx, "field", j, "type"];
	+ if (match(field_type, /^char\[[0-9]+\]$/)) {
	+ printh("\t\tstrncpy(__evt." field_name ", (__" field_name ") ? (__" field_name ") : \"\", sizeof(__evt." field_name ") - 1); \\");
	+ printh("\t\t__evt." field_name "[sizeof(__evt." field_name ") - 1] = '\\0'; \\");
	+ } else {
	+ printh("\t\t__evt." field_name " = (__" field_name "); \\");
	+ }
	+ }
	+ printh("\t\teventlog_event_write(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), &__evt, sizeof(__evt)); \\");
	+ } else {
	+ printh("\t\tstruct " struct_type_name " __evt = { \\");
	+ for (j = 1; j <= field_count; j++) {
	+ field_name = structs[struct_idx, "field", j, "name"];
	+ if (j < field_count) {
	+ printh("\t\t\t." field_name " = (__" field_name "), \\");
	+ } else {
	+ printh("\t\t\t." field_name " = (__" field_name ") \\");
	+ }
	+ }
	+ printh("\t\t}; \\");
	+ printh("\t\teventlog_event_write(__session, " events[i, "id"] ", " level_enum ", (" keyword_flags "), &__evt, sizeof(__evt)); \\");
	+ }
	+ printh("\t} while (0)");
	+ printh("");
	+
	+ # Generate _LOG macro (checks enabled and calls _LOG_ALWAYS)
	+ # Use same __ prefixed parameters, pass directly to _LOG_ALWAYS
	+ printh("/* " param_comment " */");
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_LOG(" param_list ") \\");
	+ printh("\tdo { \\");
	+ printh("\t\tif (" provider_upper "_EVENTLOG_" event_name "_ENABLED(__session)) { \\");
	+ printh("\t\t\t" provider_upper "_EVENTLOG_" event_name "_LOG_ALWAYS(" param_list "); \\");
	+ printh("\t\t} \\");
	+ printh("\t} while (0)");
	+ printh("");
	+ # Generate format string constant - for producer mode, just store the original format string
	+ # The consumer mode will convert %N placeholders to printf format specifiers
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" events[i, "format"] "\"");
	+ printh("");
	+ }
	+ }
	+
	+ # SESSION_END/SESSION_CREATE use fixed IDs from eventlog.h - no producer defines needed
	+ }
	+
	+ # Generate enum/flag lookup functions for userland (consumer mode only)
	+ if (mode == "consumer") {
	+ # SESSION_END/SESSION_CREATE use fixed EVENTLOG_SESSION_END_ID, EVENTLOG_SESSION_CREATE_ID from eventlog.h
	+ printh("#include <stdio.h>");
	+ printh("#include <string.h>");
	+ # Check if we need arpa/inet.h and sys/socket.h for IP address formatting or ntohs
	+ needs_inet_header = 0;
	+ for (i = 1; i <= nstructs; i++) {
	+ field_count = structs[i, "field_count"];
	+ for (j = 1; j <= field_count; j++) {
	+ field_type = structs[i, "field", j, "type"];
	+ if (field_type == "in_addr_t" \|\| field_type == "struct in_addr" \|\| field_type == "in6_addr_t" \|\| field_type == "struct in6_addr") {
	+ needs_inet_header = 1;
	+ break;
	+ }
	+ if (structs[i, "field", j, "ntohs"] == 1) {
	+ needs_inet_header = 1;
	+ break;
	+ }
	+ }
	+ if (needs_inet_header)
	+ break;
	+ }
	+ if (needs_inet_header) {
	+ printh("#include <sys/socket.h>");
	+ printh("#include <arpa/inet.h>");
	+ printh("#include <netinet/in.h>");
	+ }
	+ printh("");
	+ printh("/*");
	+ printh(" * Format string constants");
	+ printh(" */");
	+ printh("");
	+ # Format string constants are generated in the formatting function below
	+ # where %N placeholders are converted to printf format specifiers
	+ printh("");
	+ printh("/*");
	+ printh(" * Enum and flag lookup functions");
	+ printh(" * These functions convert numeric enum/flag values to strings");
	+ printh(" */");
	+ printh("");
	+
	+ # Generate enum lookup functions
	+ for (i = 1; i <= nenums; i++) {
	+ enum_name = enums[i, "name"];
	+ value_count = enums[i, "value_count"];
	+ printh("/*");
	+ printh(" * Lookup enum value for " enum_name);
	+ printh(" * Returns string representation or NULL if not found");
	+ printh(" */");
	+ printh("static inline const char *");
	+ printh(provider_lower "_eventlog_enum_" tolower(enum_name) "_to_string(uint32_t value)");
	+ printh("{");
	+ printh("\tswitch (value) {");
	+ for (j = 1; j <= value_count; j++) {
	+ value_num = enums[i, "value", j, "num"];
	+ value_name = enums[i, "value", j, "name"];
	+ printh("\tcase " value_num ":");
	+ printh("\t\treturn \"" value_name "\";");
	+ }
	+ printh("\tdefault:");
	+ printh("\t\treturn NULL;");
	+ printh("\t}");
	+ printh("}");
	+ printh("");
	+ }
	+
	+ # Generate flag lookup functions
	+ for (i = 1; i <= nflags; i++) {
	+ flag_name = flags[i, "name"];
	+ value_count = flags[i, "value_count"];
	+ printh("/*");
	+ printh(" * Lookup flag value for " flag_name);
	+ printh(" * Returns string representation of combined flags or NULL if empty");
	+ printh(" * Format: \"FLAG1\|FLAG2\|...\"");
	+ printh(" */");
	+ printh("static inline int");
	+ printh(provider_lower "_eventlog_flag_" tolower(flag_name) "_to_string(uint32_t value, char *buf, size_t bufsize)");
	+ printh("{");
	+ printh("\tint len = 0;");
	+ printh("\tint first = 1;");
	+ printh("");
	+ printh("\tif (buf == NULL \|\| bufsize == 0)");
	+ printh("\t\treturn -1;");
	+ printh("");
	+ printh("\tbuf[0] = '\\0';");
	+ printh("");
	+ printh("\tif (value == 0)");
	+ printh("\t\treturn 0;");
	+ printh("");
	+ # Generate flag bit checks
	+ for (j = 1; j <= value_count; j++) {
	+ value_num = flags[i, "value", j, "num"];
	+ value_name = flags[i, "value", j, "name"];
	+ printh("\tif (value & " value_num ") {");
	+ printh("\t\tif (!first && len < (int)bufsize - 1) {");
	+ printh("\t\t\tbuf[len++] = '\|';");
	+ printh("\t\t}");
	+ printh("\t\tfirst = 0;");
	+ printh("\t\tif (len < (int)bufsize - 1) {");
	+ printh("\t\t\tint n = snprintf(buf + len, bufsize - len, \"" value_name "\");");
	+ printh("\t\t\tif (n > 0 && n < (int)(bufsize - len))");
	+ printh("\t\t\t\tlen += n;");
	+ printh("\t\t}");
	+ printh("\t}");
	+ }
	+ printh("");
	+ printh("\tbuf[len] = '\\0';");
	+ printh("\treturn len;");
	+ printh("}");
	+ printh("");
	+ }
	+
	+ # Generate keyword name-to-bitmask lookup function
	+ printh("/*");
	+ printh(" * Convert a keyword name string to its bitmask value.");
	+ printh(" * Returns the keyword bitmask, or 0 if the name is not recognized.");
	+ printh(" */");
	+ printh("static inline uint32_t");
	+ printh(provider_lower "_eventlog_keyword_from_string(const char *name)");
	+ printh("{");
	+ for (i = 1; i <= nkeywords; i++) {
	+ kw_name = keywords[i, "name"];
	+ # Resolve the define value we already emitted
	+ define_name = toupper(provider) "_EVENTLOG_KEYWORD_" kw_name;
	+ printh("\tif (strcasecmp(name, \"" kw_name "\") == 0)");
	+ printh("\t\treturn (" define_name ");");
	+ }
	+ printh("\treturn (0);");
	+ printh("}");
	+ printh("");
	+
	+ # Generate formatting functions for userland (elog utility)
	+ printh("/*");
	+ printh(" * Userland formatting functions for event log parsing");
	+ printh(" * These functions format event data into human-readable strings");
	+ printh(" */");
	+ printh("");
	+
	+
	+ # Generate per-event formatting functions
	+ for (i = 1; i <= nevents; i++) {
	+ event_name = events[i, "name"];
	+ struct_name = events[i, "struct"];
	+ format_str = events[i, "format"];
	+ event_id = events[i, "id"];
	+
	+ if (struct_name == "NONE") {
	+ # No-payload event: format function takes no evt argument
	+ printf_format_escaped = format_str;
	+ gsub(/\\/, "\\\\", printf_format_escaped);
	+ gsub(/"/, "\\\"", printf_format_escaped);
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" printf_format_escaped "\"");
	+ printh("");
	+ printh("/*");
	+ printh(" * Format " provider_upper " " event_name " event to string");
	+ printh(" * Returns number of characters written (excluding null terminator),");
	+ printh(" * or -1 on error");
	+ printh(" */");
	+ printh("static inline int");
	+ printh(provider_lower "_eventlog_format_" tolower(event_name) "(char *buf, size_t bufsize)");
	+ printh("{");
	+ printh("\treturn snprintf(buf, bufsize, " provider_upper "_EVENTLOG_" event_name "_FORMAT);");
	+ printh("}");
	+ printh("");
	+ continue;
	+ }
	+
	+ # Find the struct definition
	+ struct_idx = 0;
	+ for (j = 1; j <= nstructs; j++) {
	+ if (structs[j, "name"] == struct_name) {
	+ struct_idx = j;
	+ break;
	+ }
	+ }
	+
	+ if (struct_idx == 0) {
	+ die("Struct " struct_name " not found for event " event_name);
	+ }
	+
	+ struct_type_name = provider_lower "_eventlog_" tolower(struct_name);
	+ field_count = structs[struct_idx, "field_count"];
	+
	+ # Parse format string to find positional placeholders (%1, %2, etc.)
	+ # and build mapping from placeholder index to field index
	+ placeholder_count = 0;
	+ delete placeholder_to_field;
	+ # Extract all %N placeholders from format string
	+ format_copy = format_str;
	+ while (match(format_copy, /%[0-9]+/)) {
	+ placeholder_num = substr(format_copy, RSTART + 1, RLENGTH - 1) + 0; # Extract number, convert to int
	+ if (placeholder_num > 0 && placeholder_num <= field_count) {
	+ # Varlen fields cannot be referenced from a format string --
	+ # there is no single printf specifier for a variable-length
	+ # array. Use the generated accessor helper at runtime instead.
	+ if (structs[struct_idx, "field", placeholder_num, "is_varlen"] == 1) {
	+ die("Event " event_name " format references varlen field (%s); use the generated accessor helper instead", \
	+ structs[struct_idx, "field", placeholder_num, "name"]);
	+ }
	+ placeholder_count++;
	+ placeholder_to_field[placeholder_count] = placeholder_num;
	+ } else {
	+ die("Invalid placeholder %" placeholder_num " in format string for event " event_name " (field count is " field_count ")");
	+ }
	+ format_copy = substr(format_copy, RSTART + RLENGTH);
	+ }
	+
	+ # If no placeholders found, assume old-style format (all fields in order)
	+ # but only when the format string contains % (e.g. "Value: %1"). Events with
	+ # no format args (e.g. "Timer canceled", "Session ended") use no-args path.
	+ # Skip varlen fields -- they cannot be printf-formatted inline.
	+ if (placeholder_count == 0 && index(format_str, "%") > 0 && event_name != "SESSION_END") {
	+ for (j = 1; j <= field_count; j++) {
	+ if (structs[struct_idx, "field", j, "is_varlen"] == 1)
	+ continue;
	+ placeholder_count++;
	+ placeholder_to_field[placeholder_count] = j;
	+ }
	+ }
	+
	+ # Build printf format string by replacing %N placeholders with actual format specifiers
	+ # (Do this early so we can generate the format constant before the function)
	+ printf_format = format_str;
	+ # Process placeholders in reverse order to avoid replacing parts of already-replaced placeholders
	+ # Build a sorted list of unique field indices
	+ delete field_indices;
	+ field_idx_count = 0;
	+ for (j = 1; j <= placeholder_count; j++) {
	+ field_idx = placeholder_to_field[j];
	+ found = 0;
	+ for (k = 1; k <= field_idx_count; k++) {
	+ if (field_indices[k] == field_idx) {
	+ found = 1;
	+ break;
	+ }
	+ }
	+ if (!found) {
	+ field_idx_count++;
	+ field_indices[field_idx_count] = field_idx;
	+ }
	+ }
	+ # Sort field indices in descending order for replacement
	+ for (j = 1; j < field_idx_count; j++) {
	+ for (k = j + 1; k <= field_idx_count; k++) {
	+ if (field_indices[j] < field_indices[k]) {
	+ tmp = field_indices[j];
	+ field_indices[j] = field_indices[k];
	+ field_indices[k] = tmp;
	+ }
	+ }
	+ }
	+ # Replace placeholders with format specifiers (largest first to avoid partial matches)
	+ for (j = 1; j <= field_idx_count; j++) {
	+ field_idx = field_indices[j];
	+ field_type = structs[struct_idx, "field", field_idx, "type"];
	+ enum_type = structs[struct_idx, "field", field_idx, "enum_type"];
	+ flag_type = structs[struct_idx, "field", field_idx, "flag_type"];
	+ hex_format = structs[struct_idx, "field", field_idx, "hex_format"];
	+ format_spec = get_printf_format(field_type, enum_type, flag_type, hex_format);
	+ # Add "0x" prefix for hex fields
	+ if (hex_format) {
	+ format_spec = "0x" format_spec;
	+ }
	+ # Replace %N with the format specifier
	+ placeholder_str = "%" field_idx;
	+ gsub(placeholder_str, format_spec, printf_format);
	+ }
	+
	+ # Generate the format string constant with printf specifiers (before the function)
	+ # Escape quotes and backslashes in the format string for C string literal
	+ # Note: % signs are preserved as-is (they're part of printf format specifiers)
	+ printf_format_escaped = printf_format;
	+ gsub(/\\/, "\\\\", printf_format_escaped); # Escape backslashes
	+ gsub(/"/, "\\\"", printf_format_escaped); # Escape quotes
	+ printh("#define " provider_upper "_EVENTLOG_" event_name "_FORMAT \"" printf_format_escaped "\"");
	+ printh("");
	+
	+ # Generate formatting function for this event
	+ printh("/*");
	+ printh(" * Format " provider_upper " " event_name " event to string");
	+ printh(" * Returns number of characters written (excluding null terminator),");
	+ printh(" * or -1 on error");
	+ printh(" */");
	+ printh("static inline int");
	+ printh(provider_lower "_eventlog_format_" tolower(event_name) "(const struct " struct_type_name " *evt, size_t payload_size, char *buf, size_t bufsize)");
	+ printh("{");
	+ printh("\tint ret;");
	+ printh("\t(void)payload_size; /* may be unused for fixed-size events */");
	+ printh("");
	+
	+ # Determine which fields are actually used in the format string
	+ delete fields_used;
	+ for (j = 1; j <= placeholder_count; j++) {
	+ field_idx = placeholder_to_field[j];
	+ fields_used[field_idx] = 1;
	+ }
	+
	+ # Generate enum/flag/IP lookups only for fields that are used
	+ needs_lookup = 0;
	+ for (j = 1; j <= field_count; j++) {
	+ if (!fields_used[j])
	+ continue;
	+ field_type = structs[struct_idx, "field", j, "type"];
	+ if (structs[struct_idx, "field", j, "enum_type"] != "" \|\| structs[struct_idx, "field", j, "flag_type"] != "") {
	+ needs_lookup = 1;
	+ }
	+ if (field_type == "in_addr_t" \|\| field_type == "struct in_addr" \|\| field_type == "in6_addr_t" \|\| field_type == "struct in6_addr") {
	+ needs_lookup = 1;
	+ }
	+ }
	+
	+ # Generate enum/flag/IP lookups and convert to strings (only for used fields)
	+ for (j = 1; j <= field_count; j++) {
	+ if (!fields_used[j])
	+ continue;
	+ field_name = structs[struct_idx, "field", j, "name"];
	+ field_type = structs[struct_idx, "field", j, "type"];
	+ enum_type = structs[struct_idx, "field", j, "enum_type"];
	+ flag_type = structs[struct_idx, "field", j, "flag_type"];
	+
	+ if (enum_type != "") {
	+ printh("\tconst char *" field_name "_str = " provider_lower "_eventlog_enum_" tolower(enum_type) "_to_string(evt->" field_name ");");
	+ printh("\tchar " field_name "_val[32];");
	+ printh("\tif (" field_name "_str == NULL)");
	+ printh("\t\tsnprintf(" field_name "_val, sizeof(" field_name "_val), \"%u\", evt->" field_name ");");
	+ } else if (flag_type != "") {
	+ printh("\tchar " field_name "_buf[128];");
	+ printh("\tint " field_name "_len = " provider_lower "_eventlog_flag_" tolower(flag_type) "_to_string(evt->" field_name ", " field_name "_buf, sizeof(" field_name "_buf));");
	+ printh("\tchar " field_name "_val[32];");
	+ printh("\tif (" field_name "_len == 0)");
	+ printh("\t\tsnprintf(" field_name "_val, sizeof(" field_name "_val), \"%u\", evt->" field_name ");");
	+ } else if (field_type == "in_addr_t" \|\| field_type == "struct in_addr") {
	+ printh("\tchar " field_name "_str[INET_ADDRSTRLEN];");
	+ printh("\tif (inet_ntop(AF_INET, &evt->" field_name ", " field_name "_str, sizeof(" field_name "_str)) == NULL)");
	+ printh("\t\tstrcpy(" field_name "_str, \"<invalid>\");");
	+ } else if (field_type == "in6_addr_t" \|\| field_type == "struct in6_addr") {
	+ printh("\tchar " field_name "_str[INET6_ADDRSTRLEN];");
	+ printh("\tif (inet_ntop(AF_INET6, &evt->" field_name ", " field_name "_str, sizeof(" field_name "_str)) == NULL)");
	+ printh("\t\tstrcpy(" field_name "_str, \"<invalid>\");");
	+ }
	+ }
	+
	+ if (needs_lookup) {
	+ printh("");
	+ }
	+
	+ # Note: Format string constant was already generated above before the function
	+ # printf_format variable is already set with the converted format string
	+
	+ # Build argument list in the order placeholders appear in format string
	+ arg_list = "";
	+ for (j = 1; j <= placeholder_count; j++) {
	+ field_idx = placeholder_to_field[j];
	+ field_name = structs[struct_idx, "field", field_idx, "name"];
	+ field_type = structs[struct_idx, "field", field_idx, "type"];
	+ enum_type = structs[struct_idx, "field", field_idx, "enum_type"];
	+ flag_type = structs[struct_idx, "field", field_idx, "flag_type"];
	+
	+ if (arg_list != "")
	+ arg_list = arg_list ", ";
	+
	+ if (enum_type != "") {
	+ # Use enum string if available, otherwise use formatted number
	+ arg_list = arg_list "(" field_name "_str != NULL ? " field_name "_str : " field_name "_val)";
	+ } else if (flag_type != "") {
	+ # Use flag string if available, otherwise use formatted number
	+ arg_list = arg_list "(" field_name "_len > 0 ? " field_name "_buf : " field_name "_val)";
	+ } else if (field_type == "in_addr_t" \|\| field_type == "struct in_addr") {
	+ # Use formatted IP address string
	+ arg_list = arg_list field_name "_str";
	+ } else if (field_type == "in6_addr_t" \|\| field_type == "struct in6_addr") {
	+ # Use formatted IPv6 address string (works with or without INET6)
	+ arg_list = arg_list field_name "_str";
	+ } else if (structs[struct_idx, "field", field_idx, "ntohs"] == 1) {
	+ # Network-to-host byte order conversion
	+ arg_list = arg_list "ntohs(evt->" field_name ")";
	+ } else {
	+ # Direct field access
	+ arg_list = arg_list "evt->" field_name;
	+ }
	+ }
	+
	+ # Handle empty format strings (no placeholders)
	+ # Note: Format string constant was already generated above before the function
	+ if (placeholder_count > 0) {
	+ printh("\tret = snprintf(buf, bufsize, " provider_upper "_EVENTLOG_" event_name "_FORMAT, " arg_list ");");
	+ } else {
	+ printh("\t(void)evt; /* Unused for empty format */");
	+ printh("\tret = snprintf(buf, bufsize, " provider_upper "_EVENTLOG_" event_name "_FORMAT);");
	+ }
	+ printh("\treturn ret;");
	+ printh("}");
	+ printh("");
	+ }
	+
	+ # Generate generic formatting function that formats payload based on event ID
	+ printh("/*");
	+ printh(" * Format an event payload to string");
	+ printh(" * payload: Pointer to event payload data");
	+ printh(" * payload_size: Size of the payload");
	+ printh(" * event_id: Event ID to determine which formatter to use");
	+ printh(" * buf: Output buffer");
	+ printh(" * bufsize: Size of output buffer");
	+ printh(" * Returns number of characters written, or -1 on error");
	+ printh(" */");
	+ printh("static inline int");
	+ printh(provider_lower "_eventlog_format_payload(const void *payload, size_t payload_size, uint32_t event_id, char *buf, size_t bufsize)");
	+ printh("{");
	+ printh("\t(void)payload_size; /* May be unused depending on event */");
	+ printh("\tif (buf == NULL \|\| bufsize == 0)");
	+ printh("\t\treturn -1;");
	+ printh("\tif (payload == NULL && payload_size > 0)");
	+ printh("\t\treturn -1;");
	+ printh("\t");
	+ printh("\tswitch (event_id) {");
	+
	+ for (i = 1; i <= nevents; i++) {
	+ event_name = events[i, "name"];
	+ struct_name = events[i, "struct"];
	+ event_id = events[i, "id"];
	+
	+ printh("\tcase " event_id ":");
	+ if (struct_name == "NONE") {
	+ printh("\t\treturn " provider_lower "_eventlog_format_" tolower(event_name) "(buf, bufsize);");
	+ } else {
	+ struct_type_name = provider_lower "_eventlog_" tolower(struct_name);
	+ if (event_name == "SESSION_CREATE") {
	+ printh("\t\tif (payload_size == 0)");
	+ printh("\t\t\treturn snprintf(buf, bufsize, \"Session created\");");
	+ }
	+ printh("\t\treturn " provider_lower "_eventlog_format_" tolower(event_name) "((const struct " struct_type_name " *)payload, payload_size, buf, bufsize);");
	+ }
	+ }
	+
	+ printh("\tdefault:");
	+ printh("\t\treturn snprintf(buf, bufsize, \"[UNKNOWN_EVENT_ID:%u]\", event_id);");
	+ printh("\t}");
	+ printh("}");
	+ printh("");
	+
	+ # Generate event ID to name lookup function
	+ printh("/*");
	+ printh(" * Map event ID to event name string");
	+ printh(" * Returns event name (e.g. \"IN\", \"OUT\") or NULL if unknown");
	+ printh(" */");
	+ printh("static inline const char *");
	+ printh(provider_lower "_eventlog_event_id_to_name(uint32_t event_id)");
	+ printh("{");
	+ printh("\tswitch (event_id) {");
	+
	+ for (i = 1; i <= nevents; i++) {
	+ event_name = events[i, "name"];
	+ event_id = events[i, "id"];
	+ printh("\tcase " event_id ": return \"" event_name "\";");
	+ }
	+
	+ printh("\tdefault: return NULL;");
	+ printh("\t}");
	+ printh("}");
	+ printh("");
	+ }
	+
	+ printh("#endif /* _" provider_upper "_EVENTLOG_H_ */");
	+}
	+
	diff --git a/include/eventlog/test_eventlog_schema.src b/include/eventlog/test_eventlog_schema.src
	new file mode 100644
	--- /dev/null
	+++ b/include/eventlog/test_eventlog_schema.src
	@@ -0,0 +1,59 @@
	+/*
	+ * Copyright (c) 2026 Netflix, Inc.
	+ *
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ */
	+
	+/*
	+ * This file defines the schema for test event logging.
	+ * It is processed by eventlog_gen.awk to generate the event log header
	+ * for testing purposes.
	+ */
	+
	+PROVIDER test
	+
	+KEYWORD BASIC 1
	+KEYWORD ADVANCED 2
	+KEYWORD COMPLEX 4
	+
	+ENUM test_status
	+ 0:INIT
	+ 1:RUNNING
	+ 2:SUCCESS
	+ 3:FAILED
	+
	+FLAG test_flags
	+ 0x01:FLAG_A
	+ 0x02:FLAG_B
	+ 0x04:FLAG_C
	+ 0x08:FLAG_D
	+
	+STRUCT SIMPLE_EVENT
	+ value:uint32_t
	+STRUCT STATUS_EVENT
	+ id:uint64_t
	+ status:uint8_t:enum_test_status
	+STRUCT FLAGS_EVENT
	+ id:uint64_t
	+ flags:uint32_t:flag_test_flags
	+STRUCT COMPLEX_EVENT
	+ id:uint64_t
	+ value:uint32_t
	+ status:uint8_t:enum_test_status
	+ flags:uint32_t:flag_test_flags
	+ counter:int32_t
	+STRUCT VARLEN_EVENT
	+ id:uint64_t
	+ count:uint8_t
	+ values:uint64_t[count:8]
	+
	+EVENT SIMPLE_EVENT 0 INFO BASIC SIMPLE_EVENT
	+ "Simple event: value=%u"
	+EVENT STATUS_EVENT 1 INFO BASIC STATUS_EVENT
	+ "[%lu] Status event: status=%s"
	+EVENT FLAGS_EVENT 2 VERBOSE ADVANCED FLAGS_EVENT
	+ "[%lu] Flags event: flags=%s"
	+EVENT COMPLEX_EVENT 3 WARN COMPLEX COMPLEX_EVENT
	+ "[%lu] Complex event: value=%u status=%s flags=%s counter=%d"
	+EVENT VARLEN_EVENT 4 INFO BASIC VARLEN_EVENT
	+ "[%1] Varlen event: count=%2"
	diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile
	--- a/share/man/man5/Makefile
	+++ b/share/man/man5/Makefile
	@@ -9,6 +9,7 @@
	dir.5 \
	disktab.5 \
	elf.5 \
	+ elog.5 \
	ethers.5 \
	eui64.5 \
	fbtab.5 \
	diff --git a/share/man/man5/elog.5 b/share/man/man5/elog.5
	new file mode 100644
	--- /dev/null
	+++ b/share/man/man5/elog.5
	@@ -0,0 +1,229 @@
	+.\"
	+.\" Copyright (c) 2026 Netflix, Inc.
	+.\"
	+.\" SPDX-License-Identifier: BSD-2-Clause
	+.\"
	+.Dd March 19, 2026
	+.Dt ELOG 5
	+.Os
	+.Sh NAME
	+.Nm elog
	+.Nd eventlog binary capture file format
	+.Sh DESCRIPTION
	+An
	+.Nm
	+file stores a captured sequence of kernel events produced by the
	+.Xr eventlog 9
	+framework.
	+Files are created by the
	+.Xr elog 1
	+utility using the
	+.Fl o
	+option and can be read back with the
	+.Fl r
	+option.
	+.Pp
	+All multi-byte integer fields are in host-native byte order
	+.Pq little-endian on amd64 and aarch64 .
	+.Pp
	+An
	+.Nm
	+file consists of three contiguous sections:
	+.Bd -literal -offset indent
	+[file header (40 bytes)]
	+[provider table (4 + N*34 bytes)]
	+[event stream]
	+.Ed
	+.Ss File Header
	+The file header is 40 bytes, packed with no padding:
	+.Bl -column "Offset" "Size" "uint64_t" "dropped_events" -offset indent
	+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
	+.It 0 Ta 4 Ta Vt char[4] Ta Va magic
	+.It 4 Ta 4 Ta Vt uint32_t Ta Va version
	+.It 8 Ta 8 Ta Vt uint64_t Ta Va capture_start
	+.It 16 Ta 8 Ta Vt uint64_t Ta Va start_utc_us
	+.It 24 Ta 8 Ta Vt uint64_t Ta Va event_count
	+.It 32 Ta 8 Ta Vt uint64_t Ta Va dropped_events
	+.El
	+.Pp
	+The fields are:
	+.Bl -tag -width indent
	+.It Va magic
	+The four ASCII bytes
	+.Dq ELOG
	+.Pq Li 0x45 0x4C 0x4F 0x47 .
	+A consumer must verify this field before proceeding.
	+.It Va version
	+Format version number.
	+The current version is 1.
	+Unknown versions should be rejected.
	+.It Va capture_start
	+Timestamp of the first captured event, in microseconds since boot.
	+.It Va start_utc_us
	+UTC wall-clock time at capture start, in microseconds since the Unix epoch.
	+Used together with
	+.Va capture_start
	+to convert event timestamps to UTC:
	+.Bd -literal -offset indent
	+utc_us = start_utc_us + (event.timestamp - capture_start)
	+.Ed
	+.It Va event_count
	+Total number of events in the file.
	+Written as 0 initially and updated by seeking back to the header when
	+capture ends.
	+May be 0 if the capturing tool was killed.
	+.It Va dropped_events
	+Number of events the kernel dropped due to full buffers during capture.
	+Also updated when capture ends.
	+.El
	+.Pp
	+The C structure is:
	+.Bd -literal -offset indent
	+struct elog_binary_header {
	+ char magic[4];
	+ uint32_t version;
	+ uint64_t capture_start;
	+ uint64_t start_utc_us;
	+ uint64_t event_count;
	+ uint64_t dropped_events;
	+} __packed;
	+.Ed
	+.Ss Provider Table
	+Immediately after the file header, the provider table maps numeric
	+.Va provider_id
	+values found in event headers to human-readable provider names.
	+.Pp
	+The table starts with a 4-byte count:
	+.Bl -column "Offset" "Size" "uint32_t" -offset indent
	+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
	+.It 0 Ta 4 Ta Vt uint32_t Ta Va count
	+.El
	+.Pp
	+This is followed by
	+.Va count
	+entries, each 34 bytes and packed:
	+.Bl -column "Offset" "Size" "uint16_t" "provider_id" -offset indent
	+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
	+.It 0 Ta 2 Ta Vt uint16_t Ta Va provider_id
	+.It 2 Ta 32 Ta Vt char[32] Ta Va name
	+.El
	+.Pp
	+The
	+.Va name
	+field is null-terminated and zero-padded to 32 bytes.
	+Multiple entries may share the same name when multiple kernel subsystems
	+register providers under the same name
	+.Pq see Xr eventlog 9 .
	+The maximum number of entries is 32
	+.Pq Dv EVENTLOG_MAX_PROVIDERS .
	+.Pp
	+The total provider table size is
	+.Li 4 + count * 34
	+bytes.
	+.Pp
	+The C structure for each entry is:
	+.Bd -literal -offset indent
	+struct eventlog_provider_info {
	+ uint16_t provider_id;
	+ char name[EVENTLOG_PROVIDER_NAME_MAX];
	+} __packed;
	+.Ed
	+.Ss Event Stream
	+The remainder of the file is a packed sequence of events in timestamp
	+order.
	+Each event consists of a 32-byte header followed by a variable-length
	+payload:
	+.Bd -literal -offset indent
	+[header (32 bytes)][payload (0+ bytes)]
	+.Ed
	+.Pp
	+Events are packed back-to-back with no inter-event padding.
	+The event header layout is:
	+.Bl -column "Offset" "Size" "uint64_t" "event_length" -offset indent
	+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
	+.It 0 Ta 2 Ta Vt uint16_t Ta Va event_length
	+.It 2 Ta 2 Ta Vt uint16_t Ta Va cpu
	+.It 4 Ta 2 Ta Vt uint16_t Ta Va provider_id
	+.It 6 Ta 2 Ta Vt uint16_t Ta (reserved)
	+.It 8 Ta 8 Ta Vt uint64_t Ta Va timestamp
	+.It 16 Ta 8 Ta Vt uint64_t Ta Va session_id
	+.It 24 Ta 4 Ta Vt uint32_t Ta Va event_id
	+.It 28 Ta 4 Ta Vt int32_t Ta Va thread_id
	+.El
	+.Pp
	+The
	+.Va event_length
	+field gives the total event size in bytes, including the header.
	+The minimum value is 32
	+.Pq header only, no payload ;
	+the maximum is 65535.
	+A parser advances to the next event by adding
	+.Va event_length
	+to the current event's offset.
	+.Pp
	+The
	+.Va provider_id
	+maps to a provider name via the provider table.
	+The
	+.Va timestamp
	+is in microseconds since boot; convert to UTC using the formula
	+described in the file header section.
	+The
	+.Va session_id
	+is provider-defined.
	+The reserved field at offset 6 is written as zero and must be ignored.
	+.Pp
	+Two
	+.Va event_id
	+values are reserved across all providers:
	+.Bl -column "EVENTLOG_SESSION_CREATE_ID" "0xFFFFFFFE" -offset indent
	+.It Sy Constant Ta Sy Value Ta Sy Meaning
	+.It Dv EVENTLOG_SESSION_CREATE_ID Ta Li 0xFFFFFFFE Ta Session created
	+.It Dv EVENTLOG_SESSION_END_ID Ta Li 0xFFFFFFFF Ta Session destroyed
	+.El
	+.Pp
	+The
	+.Dv EVENTLOG_SESSION_END_ID
	+event has an empty payload
	+.Pq 0 bytes .
	+All other payload formats are provider-specific and defined by schema files.
	+The payload size is
	+.Va event_length
	+minus 32.
	+.Pp
	+The event stream may end with an incomplete event at EOF if the capture
	+was interrupted.
	+Parsers should stop when fewer than 32 bytes remain or when
	+.Va event_length
	+is less than 32 or exceeds the remaining data.
	+.Ss Compressed Files
	+The
	+.Xr elog 1
	+utility also accepts
	+.Pa .gz Ns -compressed
	+files
	+.Pq detected by file extension .
	+The decompressed content has the same format described above.
	+.Sh EXAMPLES
	+Capture TCP events to a binary file:
	+.Bd -literal -offset indent
	+elog -c tcp -o /tmp/events.elog
	+.Ed
	+.Pp
	+Read back as formatted text:
	+.Bd -literal -offset indent
	+elog -r /tmp/events.elog
	+.Ed
	+.Pp
	+Read a compressed capture:
	+.Bd -literal -offset indent
	+elog -r /tmp/events.elog.gz
	+.Ed
	+.Sh SEE ALSO
	+.Xr elog 1 ,
	+.Xr eventlog 9
	+.Sh HISTORY
	+The
	+.Nm
	+file format first appeared in
	+.Fx 16.0 .
	diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
	--- a/share/man/man9/Makefile
	+++ b/share/man/man9/Makefile
	@@ -147,6 +147,7 @@
	ecn.9 \
	efirt.9 \
	epoch.9 \
	+ eventlog.9 \
	ether_gen_addr.9 \
	EVENTHANDLER.9 \
	eventtimers.9 \
	@@ -1123,6 +1124,30 @@
	epoch.9 epoch_call.9 \
	epoch.9 epoch_drain_callbacks.9 \
	epoch.9 in_epoch.9
	+MLINKS+=eventlog.9 eventlog_provider_create.9 \
	+ eventlog.9 eventlog_provider_destroy.9 \
	+ eventlog.9 eventlog_provider_get_level.9 \
	+ eventlog.9 eventlog_provider_get_keywords.9 \
	+ eventlog.9 eventlog_provider_get_default.9 \
	+ eventlog.9 eventlog_provider_set_default.9 \
	+ eventlog.9 eventlog_provider_get_sysctl_node.9 \
	+ eventlog.9 eventlog_provider_get_sysctl_ctx.9 \
	+ eventlog.9 eventlog_session_create.9 \
	+ eventlog.9 eventlog_session_destroy.9 \
	+ eventlog.9 eventlog_session_set_enabled.9 \
	+ eventlog.9 eventlog_session_is_enabled.9 \
	+ eventlog.9 eventlog_session_set_filter.9 \
	+ eventlog.9 eventlog_event_write.9 \
	+ eventlog.9 eventlog_event_write_at.9 \
	+ eventlog.9 eventlog_event_write_gather.9 \
	+ eventlog.9 eventlog_event_write_gather_at.9 \
	+ eventlog.9 eventlog_subscriber_create_device.9 \
	+ eventlog.9 eventlog_subscriber_create_callback.9 \
	+ eventlog.9 eventlog_subscriber_destroy.9 \
	+ eventlog.9 eventlog_subscriber_add_subscription.9 \
	+ eventlog.9 eventlog_subscriber_drain_dumps.9 \
	+ eventlog.9 eventlog_subscriber_read.9 \
	+ eventlog.9 eventlog_subscriber_get_stats.9
	MLINKS+=EVENTHANDLER.9 EVENTHANDLER_DECLARE.9 \
	EVENTHANDLER.9 EVENTHANDLER_DEFINE.9 \
	EVENTHANDLER.9 EVENTHANDLER_DEREGISTER.9 \
	diff --git a/share/man/man9/eventlog.9 b/share/man/man9/eventlog.9
	new file mode 100644
	--- /dev/null
	+++ b/share/man/man9/eventlog.9
	@@ -0,0 +1,1097 @@
	+.\"
	+.\" Copyright (c) 2026 Netflix, Inc.
	+.\"
	+.\" SPDX-License-Identifier: BSD-2-Clause
	+.\"
	+.Dd May 11, 2026
	+.Dt EVENTLOG 9
	+.Os
	+.Sh NAME
	+.Nm eventlog
	+.Nd subscription-based kernel event logging framework
	+.Sh SYNOPSIS
	+.In sys/eventlog.h
	+.Ft "struct eventlog_provider *"
	+.Fo eventlog_provider_create
	+.Fa "const char *name"
	+.Fa "eventlog_provider_dump_state_t dump_callback"
	+.Fa "void *dump_callback_arg"
	+.Fa "eventlog_default_changed_t default_changed"
	+.Fa "void *default_changed_arg"
	+.Fc
	+.Ft void
	+.Fo eventlog_provider_destroy
	+.Fa "struct eventlog_provider *provider"
	+.Fc
	+.Ft "enum eventlog_level"
	+.Fo eventlog_provider_get_level
	+.Fa "struct eventlog_provider *provider"
	+.Fc
	+.Ft uint32_t
	+.Fo eventlog_provider_get_keywords
	+.Fa "struct eventlog_provider *provider"
	+.Fc
	+.Ft int
	+.Fo eventlog_provider_get_default
	+.Fa "struct eventlog_provider *provider"
	+.Fc
	+.Ft void
	+.Fo eventlog_provider_set_default
	+.Fa "struct eventlog_provider *provider"
	+.Fa "int value"
	+.Fc
	+.Ft "struct sysctl_oid *"
	+.Fo eventlog_provider_get_sysctl_node
	+.Fa "struct eventlog_provider *provider"
	+.Fc
	+.Ft "struct sysctl_ctx_list *"
	+.Fo eventlog_provider_get_sysctl_ctx
	+.Fa "struct eventlog_provider *provider"
	+.Fc
	+.Ft "struct eventlog_session *"
	+.Fo eventlog_session_create
	+.Fa "struct eventlog_provider *provider"
	+.Fa "uint64_t session_id"
	+.Fa "bool waitok"
	+.Fa "void *create_payload"
	+.Fa "size_t create_payload_size"
	+.Fc
	+.Ft void
	+.Fo eventlog_session_destroy
	+.Fa "struct eventlog_session *session"
	+.Fc
	+.Ft void
	+.Fo eventlog_session_set_enabled
	+.Fa "struct eventlog_session *session"
	+.Fa "int enabled"
	+.Fc
	+.Ft int
	+.Fo eventlog_session_is_enabled
	+.Fa "struct eventlog_session *session"
	+.Fc
	+.Ft void
	+.Fo eventlog_session_set_filter
	+.Fa "struct eventlog_session *session"
	+.Fa "enum eventlog_level level"
	+.Fa "uint32_t keywords"
	+.Fc
	+.Ft void
	+.Fo eventlog_event_write
	+.Fa "struct eventlog_session *session"
	+.Fa "uint32_t id"
	+.Fa "enum eventlog_level level"
	+.Fa "uint32_t keywords"
	+.Fa "void *buffer"
	+.Fa "size_t length"
	+.Fc
	+.Ft void
	+.Fo eventlog_event_write_at
	+.Fa "struct eventlog_session *session"
	+.Fa "uint32_t id"
	+.Fa "enum eventlog_level level"
	+.Fa "uint32_t keywords"
	+.Fa "void *buffer"
	+.Fa "size_t length"
	+.Fa "uint64_t timestamp_us"
	+.Fc
	+.Ft void
	+.Fo eventlog_event_write_gather
	+.Fa "struct eventlog_session *session"
	+.Fa "uint32_t id"
	+.Fa "enum eventlog_level level"
	+.Fa "uint32_t keywords"
	+.Fa "const struct iovec *iov"
	+.Fa "int iovcnt"
	+.Fc
	+.Ft void
	+.Fo eventlog_event_write_gather_at
	+.Fa "struct eventlog_session *session"
	+.Fa "uint32_t id"
	+.Fa "enum eventlog_level level"
	+.Fa "uint32_t keywords"
	+.Fa "const struct iovec *iov"
	+.Fa "int iovcnt"
	+.Fa "uint64_t timestamp_us"
	+.Fc
	+.In sys/eventlog_subscriber.h
	+.Ft "struct eventlog_subscriber *"
	+.Fo eventlog_subscriber_create_device
	+.Fa "uint32_t buffer_size_per_cpu"
	+.Fc
	+.Ft "struct eventlog_subscriber *"
	+.Fo eventlog_subscriber_create_callback
	+.Fa "eventlog_callback_t callback"
	+.Fa "void *callback_arg"
	+.Fc
	+.Ft void
	+.Fo eventlog_subscriber_destroy
	+.Fa "struct eventlog_subscriber *subscriber"
	+.Fc
	+.Ft int
	+.Fo eventlog_subscriber_add_subscription
	+.Fa "struct eventlog_subscriber *subscriber"
	+.Fa "const char *provider_name"
	+.Fa "enum eventlog_level level"
	+.Fa "uint32_t keywords"
	+.Fc
	+.Ft void
	+.Fo eventlog_subscriber_drain_dumps
	+.Fa "struct eventlog_subscriber *subscriber"
	+.Fc
	+.Ft int
	+.Fo eventlog_subscriber_read
	+.Fa "struct eventlog_subscriber *subscriber"
	+.Fa "struct uio *uio"
	+.Fa "int flags"
	+.Fc
	+.Ft void
	+.Fo eventlog_subscriber_get_stats
	+.Fa "struct eventlog_subscriber *subscriber"
	+.Fa "struct eventlog_stats *stats"
	+.Fc
	+.Sh DESCRIPTION
	+The
	+.Nm
	+framework provides a subscription-based event logging system for the
	+.Fx
	+kernel.
	+It enables kernel subsystems
	+.Pq providers
	+to emit structured events that can be captured by user-space tools or
	+kernel callbacks
	+.Pq subscribers
	+with fine-grained filtering based on provider name, keyword bitmask, and
	+log level.
	+.Pp
	+A single character device,
	+.Pa /dev/eventlog ,
	+handles all event subscriptions and delivery.
	+Providers are only enabled when active subscribers exist, ensuring zero
	+overhead when no one is listening.
	+.Pp
	+The framework is designed around the following principles:
	+.Bl -bullet -compact
	+.It
	+Single system-wide device for all providers and subscribers.
	+.It
	+Subscription-based model with zero overhead when idle.
	+.It
	+Multi-provider support: multiple subsystems may register under the same name
	+and subscriptions automatically cover all matching providers.
	+.It
	+Per-subscriber filtering by log level and keyword bitmask.
	+.It
	+Lock-free per-CPU double-buffering for device subscribers.
	+.It
	+Timestamp-ordered event delivery across CPUs.
	+.El
	+.Ss Log Levels
	+Events are classified by severity using the
	+.Vt "enum eventlog_level"
	+type.
	+Higher numeric values are more verbose:
	+.Bl -column "EVENTLOG_LEVEL_VERBOSE" "Value" -offset indent
	+.It Sy Constant Ta Sy Value
	+.It Dv EVENTLOG_LEVEL_NONE Ta 0
	+.It Dv EVENTLOG_LEVEL_ERROR Ta 1
	+.It Dv EVENTLOG_LEVEL_WARN Ta 2
	+.It Dv EVENTLOG_LEVEL_INFO Ta 3
	+.It Dv EVENTLOG_LEVEL_VERBOSE Ta 4
	+.It Dv EVENTLOG_LEVEL_TRACE Ta 5
	+.El
	+.Pp
	+Subscribing at a given level receives all events at that level and below
	+.Pq less verbose .
	+For example, subscribing at
	+.Dv EVENTLOG_LEVEL_VERBOSE
	+receives ERROR, WARN, INFO, and VERBOSE events but not TRACE.
	+.Ss Providers
	+A provider is a kernel subsystem that emits events.
	+Providers are created with
	+.Fn eventlog_provider_create ,
	+which registers the provider under a given
	+.Fa name
	+and assigns it a unique
	+.Va provider_id
	+.Pq 1-based .
	+The
	+.Va provider_id
	+is embedded in every event header.
	+.Pp
	+Multiple subsystems may register providers with the same name.
	+For example, both the default TCP stack and the RACK TCP stack register
	+providers named
	+.Dq tcp :
	+.Bd -literal -offset indent
	+/* Default TCP stack */
	+provider = eventlog_provider_create("tcp",
	+ tcp_eventlog_dump_state, NULL,
	+ tcp_eventlog_default_changed, NULL);
	+
	+/* RACK TCP stack */
	+provider = eventlog_provider_create("tcp",
	+ tcp_eventlog_dump_state, NULL,
	+ tcp_eventlog_default_changed, NULL);
	+.Ed
	+.Pp
	+When a subscriber subscribes by name, the subscription is applied to
	+.Em all
	+providers matching that name.
	+Each provider instance gets its own unique
	+.Va provider_id ,
	+allowing consumers to distinguish which instance emitted a given event.
	+.Pp
	+The
	+.Fn eventlog_provider_destroy
	+function unregisters a provider.
	+It synchronizes with the dump state lock to ensure no dump callback is
	+in-flight when the provider is freed.
	+.Pp
	+The
	+.Fn eventlog_provider_get_level
	+and
	+.Fn eventlog_provider_get_keywords
	+functions return the current aggregate level and keyword mask computed
	+from all active subscribers.
	+These are primarily useful for testing and debugging.
	+.Pp
	+The
	+.Fn eventlog_provider_get_default
	+function returns the current default enablement setting for a provider
	+.Pq 0 or 1 .
	+The
	+.Fn eventlog_provider_set_default
	+function sets this value programmatically without iterating existing sessions.
	+.Ss Shared Statistics
	+All providers sharing the same name share a single
	+.Vt eventlog_provider_stats
	+structure, which owns the
	+.Xr sysctl 9
	+counters and the
	+.Va default
	+setting.
	+This provides a single aggregated view under
	+.Li kern.eventlog. Ns Ao Ar name Ac Ns Li .* :
	+.Bl -column "kern.eventlog.<name>.sessions_created" -offset indent
	+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .sessions_created
	+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .sessions_active
	+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .sessions_enabled
	+.It Li kern.eventlog. Ns Ao Ar name Ac Ns Li .default
	+.El
	+.Pp
	+The
	+.Li default
	+sysctl controls whether new sessions start enabled or disabled.
	+The initial value is 0
	+.Pq disabled ,
	+but can be overridden at boot via loader tunable
	+.Pq e.g., Li kern.eventlog.tcp.default=1 No in Pa /boot/loader.conf
	+or programmatically via
	+.Fn eventlog_provider_set_default .
	+It accepts four values:
	+.Bl -tag -width indent
	+.It 0
	+New sessions start disabled.
	+Sessions can be individually enabled later.
	+.It 1
	+New sessions start enabled.
	+.It \-1
	+Disable all currently active sessions across all providers with this
	+name, then set the default to 0.
	+For providers that register a
	+.Fa default_changed
	+callback, the framework skips the session iteration and defers it to
	+the callback.
	+.It 2
	+Enable all currently disabled sessions across all providers with this
	+name, then set the default to 1.
	+For providers that register a
	+.Fa default_changed
	+callback, the framework skips the session iteration and defers it to
	+the callback.
	+.El
	+.Pp
	+The stats structure is reference-counted: created when the first provider
	+with a given name is registered, shared when subsequent same-named
	+providers are created, and freed when the last provider with that name
	+is destroyed.
	+.Ss Extending The Provider Sysctl Node
	+Providers that need a handful of provider-specific knobs
	+.Pq e.g., a sampling rate
	+can hang additional
	+.Xr sysctl 9
	+children off the auto-generated
	+.Li kern.eventlog. Ns Ao Ar name Ac
	+node rather than creating a parallel
	+.Li kern. Ns Ao Ar name Ac
	+tree.
	+Two accessors expose the node and its
	+.Vt sysctl_ctx_list
	+for this purpose:
	+.Pp
	+.Bl -tag -width Ds -compact
	+.It Fn eventlog_provider_get_sysctl_node
	+Returns the
	+.Vt "struct sysctl_oid *"
	+for
	+.Li kern.eventlog. Ns Ao Ar name Ac .
	+.It Fn eventlog_provider_get_sysctl_ctx
	+Returns the
	+.Vt "struct sysctl_ctx_list *"
	+owned by the framework for this provider name.
	+.El
	+.Pp
	+Children added via these handles are freed automatically when the last
	+provider with the same name is destroyed, so callers should only register
	+children whose backing storage is valid for the lifetime of the provider.
	+For example, a provider with a tunable sampling rate could publish it as
	+.Li kern.eventlog. Ns Ao Ar name Ac Ns Li .hz :
	+.Bd -literal -offset indent
	+provider = eventlog_provider_create("myprov", NULL, NULL, NULL, NULL);
	+
	+SYSCTL_ADD_INT(
	+ eventlog_provider_get_sysctl_ctx(provider),
	+ SYSCTL_CHILDREN(eventlog_provider_get_sysctl_node(provider)),
	+ OID_AUTO, "hz", CTLFLAG_RW, &my_hz, 0,
	+ "Samples per CPU per second");
	+.Ed
	+.Ss Dump State Callback
	+When a subscriber subscribes for the first time to a provider that
	+registered a
	+.Fa dump_callback ,
	+the framework schedules an asynchronous dump on its private
	+.Dq eventlog_dump
	+taskqueue.
	+The callback emits current state for all of the provider's existing sessions
	+so the new subscriber observes sessions that were created before it started
	+listening.
	+.Pp
	+The callback runs on a dedicated framework taskqueue thread after
	+.Fn eventlog_subscriber_add_subscription
	+has already returned to the caller, so it must not depend on any per-thread
	+state of the subscribing thread (e.g.
	+.Va curthread->td_vnet ,
	+.Va curthread->td_proc ) ;
	+providers that need a vnet context must establish one themselves with
	+.Fn VNET_FOREACH No /
	+.Fn CURVNET_SET .
	+The callback should use normal event write functions or schema-generated
	+.Ql _LOG
	+macros; the framework automatically routes those writes to only the
	+subscriber that requested this dump.
	+.Pp
	+The dump taskqueue is single-threaded, so a provider's
	+.Fa dump_callback
	+is never invoked concurrently with itself or with any other dump_callback.
	+Re-subscribing an already-subscribed provider does
	+.Em not
	+re-fire the dump (the subscriber already has the state); only a brand-new
	+subscription enqueues a task.
	+.Pp
	+If
	+.Fa dump_callback
	+is
	+.Dv NULL ,
	+no task is enqueued.
	+With multi-provider support a separate task is enqueued for each matching
	+provider when subscribing by name.
	+.Pp
	+Once
	+.Fa dump_callback
	+returns, the framework synthesises a single
	+.Dv EVENTLOG_DUMP_COMPLETE_ID
	+event for the requesting subscriber.
	+The event carries the dumping provider's id, a session id of zero, and an
	+empty payload, and is filtered with the same
	+.Dv EVENTLOG_LEVEL_INFO
	++
	+.Dv EVENTLOG_KEYWORD_SESSION
	+contract that gates
	+.Dv EVENTLOG_SESSION_CREATE_ID
	+and
	+.Dv EVENTLOG_SESSION_END_ID :
	+subscribers that opted out of the session keyword bit do not receive it.
	+Userspace consumers can key on
	+.Dv EVENTLOG_DUMP_COMPLETE_ID
	+to know that all replay events for the just-subscribed provider have
	+been delivered.
	+.Pp
	+Callers that need to observe a self-consistent post-dump state before
	+proceeding (test code, save-state tools) should call
	+.Fn eventlog_subscriber_drain_dumps
	+after subscribing; it blocks until every dump task this subscriber has
	+outstanding has finished.
	+.Fn eventlog_subscriber_destroy
	+implicitly drains, so callers do not have to coordinate teardown with
	+in-flight dumps.
	+.Ss Default Changed Callback
	+When the
	+.Li kern.eventlog. Ns Ao Ar name Ac Ns Li .default
	+sysctl is written, the framework invokes the optional
	+.Fa default_changed
	+callback on each provider that shares the name.
	+The callback receives the raw sysctl value:
	+.Bl -tag -width indent
	+.It 0 No or 1
	+Informational only.
	+The framework has updated the stored default.
	+.It \-1 No or 2
	+The framework does
	+.Em not
	+iterate sessions for providers that have a
	+.Fa default_changed
	+callback; the provider is responsible for enabling or disabling its own
	+sessions.
	+For providers without a callback, the framework iterates sessions itself.
	+.El
	+.Pp
	+The callback is invoked outside any eventlog lock; the provider may take
	+its own locks.
	+.Ss Provider Enablement
	+Each provider instance is enabled or disabled independently based on its
	+subscribers.
	+Keywords are OR'd across all subscribers for that provider instance.
	+The level is set to the most verbose level requested by any subscriber.
	+For example, if subscriber A requests INFO with keywords 0x1 and
	+subscriber B requests WARN with keywords 0x2, the provider is enabled at
	+INFO with keywords 0x3.
	+.Ss Sessions
	+A session represents a single entity being observed, such as one TCP
	+connection.
	+Sessions are created with
	+.Fn eventlog_session_create ,
	+which allocates a session from a UMA zone and associates it with a
	+provider.
	+The
	+.Fa session_id
	+is a provider-assigned 64-bit identifier
	+.Pq e.g., Va inp_gencnt No for TCP
	+that is embedded in every event header.
	+The meaning of
	+.Fa session_id
	+values is provider-defined.
	+.Pp
	+If
	+.Fa waitok
	+is true, allocations use
	+.Dv M_WAITOK ;
	+otherwise
	+.Dv M_NOWAIT
	+is used and the function may return
	+.Dv NULL .
	+.Pp
	+The optional
	+.Fa create_payload
	+is provider-specific data included in the
	+.Dv EVENTLOG_SESSION_CREATE_ID
	+event emitted at session creation.
	+If
	+.Dv NULL ,
	+a default payload containing only the creation timestamp is used.
	+.Pp
	+The session's initial enabled state is derived from the provider's
	+.Li default
	+sysctl.
	+The
	+.Dv EVENTLOG_SESSION_CREATE_ID
	+event is only emitted when the session is enabled.
	+.Pp
	+The public session structure exposes two fields for use by the
	+schema-generated
	+.Ql _ENABLED
	+macros:
	+.Bd -literal -offset indent
	+struct eventlog_session {
	+ enum eventlog_level effective_level;
	+ uint32_t effective_keywords;
	+};
	+.Ed
	+.Pp
	+The
	+.Fn eventlog_session_set_enabled
	+function enables or disables a session.
	+When disabled,
	+.Va effective_level
	+is set to
	+.Dv EVENTLOG_LEVEL_NONE
	+so that
	+.Ql _ENABLED
	+checks fail with no function call overhead.
	+.Pp
	+The
	+.Fn eventlog_session_set_filter
	+function sets a per-session level and keyword override.
	+When set, effective values use this override instead of the provider's
	+aggregate values.
	+Call
	+.Fn eventlog_session_set_enabled
	+after setting the filter to apply it.
	+.Pp
	+The
	+.Fn eventlog_session_destroy
	+function destroys a session and emits an
	+.Dv EVENTLOG_SESSION_END_ID
	+event if the session was enabled.
	+.Ss Subscribers
	+Subscribers are entities that receive events.
	+There are two types:
	+.Bl -tag -width indent
	+.It Sy Device subscribers
	+Created with
	+.Fn eventlog_subscriber_create_device .
	+These use per-CPU double-buffered storage with configurable buffer size
	+.Pq Dv EVENTLOG_BUFFER_SIZE_MIN No to Dv EVENTLOG_BUFFER_SIZE_MAX .
	+Events are read via
	+.Fn eventlog_subscriber_read ,
	+which merges events across CPUs by timestamp using a min-heap.
	+Device subscribers can serve both user-space and kernel readers.
	+.It Sy Callback subscribers
	+Created with
	+.Fn eventlog_subscriber_create_callback .
	+Events are delivered directly to the callback function with no buffering.
	+This provides the lowest latency but the callback must execute quickly
	+as it runs in the context of the event writer.
	+The callback function type is:
	+.Bd -literal -offset indent
	+typedef void (*eventlog_callback_t)(
	+ const struct eventlog_event_header *hdr,
	+ const char *provider_name,
	+ uint8_t provider_name_len,
	+ uint64_t session_id,
	+ const struct iovec *iov,
	+ int iovcnt,
	+ size_t payload_size,
	+ void *callback_arg);
	+.Ed
	+.El
	+.Pp
	+The
	+.Fn eventlog_subscriber_add_subscription
	+function subscribes to a named provider.
	+If the provider name matches multiple provider instances, a separate
	+internal subscription is created for each, but this is transparent to
	+the caller.
	+For every newly-subscribed provider that registered a
	+.Fa dump_callback
	+the framework schedules an asynchronous dump on the eventlog_dump
	+taskqueue;
	+see
	+.Sx Dump State Callback .
	+.Pp
	+The
	+.Fn eventlog_subscriber_drain_dumps
	+function blocks the caller until every dump task this subscriber has
	+outstanding has finished.
	+.Pp
	+The
	+.Fn eventlog_subscriber_destroy
	+function removes all subscriptions, frees resources, and updates
	+provider enablement.
	+It implicitly drains any pending dumps before freeing memory, so callers
	+do not have to coordinate destroy with in-flight dumps.
	+.Pp
	+The
	+.Fn eventlog_subscriber_get_stats
	+function fills the
	+.Fa stats
	+structure with current subscriber statistics including
	+.Va dropped_events .
	+.Ss Event Writing
	+Events are emitted using
	+.Fn eventlog_event_write
	+or
	+.Fn eventlog_event_write_at .
	+Both functions construct a 32-byte event header on the stack and deliver
	+the event to all subscribers that match the provider, level, and keyword
	+criteria.
	+.Pp
	+The
	+.Fn eventlog_event_write_at
	+variant accepts a pre-computed timestamp in microseconds since boot,
	+for use when the caller has already queried the time.
	+.Pp
	+In practice, events are typically emitted via schema-generated macros
	+rather than calling these functions directly.
	+See
	+.Sx SCHEMA-BASED CODE GENERATION
	+below.
	+.Ss Scatter/Gather Event Writing
	+For events that carry a variable-length payload, the
	+.Fn eventlog_event_write_gather
	+and
	+.Fn eventlog_event_write_gather_at
	+variants accept the payload as an
	+.Vt iovec
	+of
	+.Fa iovcnt
	+segments.
	+The framework prepends the 32-byte header and copies the concatenation
	+of every segment into each subscriber's ring buffer as part of the same
	+reserve step, so readers never observe a partially written event.
	+A zero segment count or any zero-length entry is legal.
	+.Pp
	+These variants are the preferred shape when a producer has a fixed head
	+followed by a variable-length tail: the caller builds a two-element iov
	+referring to its own source buffers directly, avoiding an intermediate
	+copy and the worst-case stack footprint of a composite struct.
	+.Pp
	+The schema generator emits calls to
	+.Fn eventlog_event_write_gather
	+for STRUCTs that declare a trailing varlen field.
	+Scalar callers can keep using
	+.Fn eventlog_event_write ;
	+internally it builds a one-element iov and takes the same write path,
	+so the two entry points deliver byte-identical events on the wire.
	+.Pp
	+Callback subscribers receive the payload in scatter/gather form: the
	+callback signature is
	+.Pp
	+.Bd -literal -offset indent
	+typedef void (*eventlog_callback_t)(
	+ const struct eventlog_event_header *hdr,
	+ const char *provider_name, uint8_t provider_name_len,
	+ uint64_t session_id,
	+ const struct iovec *iov, int iovcnt, size_t payload_size,
	+ void *callback_arg);
	+.Ed
	+.Pp
	+Delivery happens inside an
	+.Xr smr 9
	+critical section where
	+.Xr malloc 9
	+is not permitted, which is why the framework does not compact iov
	+segments into a flat buffer.
	+For scalar producers
	+.Fa iovcnt
	+is 1 and the payload is simply
	+.Fa iov[0].iov_base .
	+For producers that use
	+.Fn eventlog_event_write_gather
	+(the usual shape for schema-generated varlen events), the callback either
	+walks the segments in order or copies them with
	+.Xr memcpy 3
	+into a caller-sized buffer.
	+The
	+.Fa iov
	+and
	+.Fa iov[*].iov_base
	+pointers are only valid for the duration of the callback; callbacks must
	+not retain them.
	+.Pp
	+Device subscribers are unaffected by this: iov segments are copied
	+straight from the caller's buffers into the per-CPU ring, up to the
	+wire-format
	+.Dv UINT16_MAX
	+.Va event_length
	+cap.
	+.Ss Event Write Path
	+The event write path is designed for minimal overhead:
	+.Bl -enum -compact
	+.It
	+The schema-generated
	+.Ql _LOG
	+macro first calls
	+.Ql _ENABLED ,
	+which checks the session's
	+.Va effective_level
	+and
	+.Va effective_keywords
	+fields directly with no function call.
	+If disabled, the macro returns immediately.
	+.It
	+If enabled, the event structure is initialized on the stack and
	+.Fn eventlog_event_write
	+is called.
	+.It
	+The function enters an SMR critical section via
	+.Fn smr_enter ,
	+which disables thread preemption and pins the thread to the current CPU.
	+This rules out thread-level writer-vs-writer contention; a hardware NMI on
	+the same CPU can still nest inside the writer (see
	+.Sx Per-CPU Buffering ) .
	+.It
	+All active subscribers are iterated under SMR protection
	+.Pq no locks .
	+For each matching subscriber, the event is routed by type:
	+device subscribers receive a buffer write; callback subscribers receive
	+a direct function invocation.
	+.It
	+The SMR critical section is exited via
	+.Fn smr_exit .
	+.El
	+.Ss Event Format
	+Each event is a contiguous byte sequence consisting of a 32-byte header
	+followed by a variable-length payload.
	+All multi-byte integer fields are in host-native byte order
	+.Pq little-endian on amd64 and aarch64 .
	+Events are packed back-to-back with no inter-event padding.
	+.Pp
	+The event header layout is:
	+.Bl -column "Offset" "Size" "uint64_t" "event_length" -offset indent
	+.It Sy Offset Ta Sy Size Ta Sy Type Ta Sy Field
	+.It 0 Ta 2 Ta Vt uint16_t Ta Va event_length
	+.It 2 Ta 2 Ta Vt uint16_t Ta Va cpu
	+.It 4 Ta 2 Ta Vt uint16_t Ta Va provider_id
	+.It 6 Ta 2 Ta Vt uint16_t Ta (reserved)
	+.It 8 Ta 8 Ta Vt uint64_t Ta Va timestamp
	+.It 16 Ta 8 Ta Vt uint64_t Ta Va session_id
	+.It 24 Ta 4 Ta Vt uint32_t Ta Va event_id
	+.It 28 Ta 4 Ta Vt int32_t Ta Va thread_id
	+.El
	+.Pp
	+The
	+.Va event_length
	+field gives the total event size in bytes
	+.Pq header + payload .
	+The minimum value is 32
	+.Pq header only ;
	+the maximum is 65535.
	+The
	+.Va timestamp
	+is in microseconds since boot, obtained via
	+.Fn binuptime
	+and converted with
	+.Fn bintime2us .
	+The
	+.Va session_id
	+is provider-defined.
	+The
	+.Va thread_id
	+is the kernel thread ID
	+.Pq Vt lwpid_t ;
	+0 if no thread context.
	+.Pp
	+Two
	+.Va event_id
	+values are reserved for session lifecycle events emitted by the framework:
	+.Bl -column "EVENTLOG_SESSION_CREATE_ID" "0xFFFFFFFE" -offset indent
	+.It Sy Constant Ta Sy Value Ta Sy Meaning
	+.It Dv EVENTLOG_SESSION_CREATE_ID Ta Li 0xFFFFFFFE Ta Session created
	+.It Dv EVENTLOG_SESSION_END_ID Ta Li 0xFFFFFFFF Ta Session destroyed
	+.El
	+.Pp
	+These use the keyword
	+.Dv EVENTLOG_KEYWORD_SESSION
	+.Pq Li 0x80000000
	+and level
	+.Dv EVENTLOG_LEVEL_INFO .
	+The payload format is described in the provider's schema;
	+.Dv EVENTLOG_SESSION_END_ID
	+has an empty payload.
	+.Pp
	+The payload immediately follows the header with no padding.
	+Its size is
	+.Va event_length
	+minus 32 bytes and its format is provider-specific, defined by schema files.
	+.Ss Per-CPU Buffering
	+Device subscribers use per-CPU double-buffering.
	+Each CPU has two buffers per subscriber: an active buffer where writers
	+append events, and a reader buffer where the reader consumes events.
	+.Pp
	+All writer-side mutable state and the reader buffer length are packed
	+into a single 64-bit word
	+.Pq Va packed_state
	+with the following bit layout:
	+.Bl -column "Bits [63:32]" "swap_allowed" -offset indent
	+.It Sy Bits Ta Sy Field Ta Sy Description
	+.It Li [63:32] Ta Va reader_len Ta Bytes in reader buffer (set at swap, 30 bits)
	+.It Li [31:2] Ta Va commit_pos Ta Byte offset of next write (30 bits)
	+.It Li [1] Ta Va swap_allowed Ta Reader has drained; writers may swap
	+.It Li [0] Ta Va active_buf Ta Which buffer is active (0 or 1)
	+.El
	+.Pp
	+On targets where the MI
	+.Fn atomic_*_64
	+API is available
	+.Pq Dv __LP64__ , i.e.\& every 64-bit architecture ,
	+every state transition (writer commit, buffer swap, reader drain) is a
	+single
	+.Fn atomic_fcmpset_64 ;
	+the path is lock-free and NMI-safe by construction.
	+.Pp
	+On 32-bit targets that do not provide
	+.Fn atomic_*_64
	+(FreeBSD's
	+.Pa sys/atomic_common.h
	+gates
	+.Fn atomic_load_64
	+on
	+.Dv __LP64__
	+for the same reason),
	+.Va packed_state
	+is the same 64-bit word but every state operation takes a per-pcpu_buf
	+.Dv MTX_SPIN
	+that serialises access to the otherwise non-atomic 64-bit field.
	+NMI-safety is provided up-front: the writer entry point
	+.Fn eventlog_subscriber_write_event_device
	+calls
	+.Fn mtx_owned "&pcpu_buf->swap_lock"
	+and drops the event if true.
	+In NMI context
	+.Va curthread
	+is the interrupted thread, so
	+.Fn mtx_owned
	+is true exactly when an NMI fired on a thread that already holds the
	+swap lock; calling
	+.Fn mtx_lock_spin
	+in that case would deadlock the NMI handler against the interrupted
	+thread, so dropping the event (counted in
	+.Va dropped_events )
	+is the only safe choice.
	+.Pp
	+Because
	+.Fn smr_enter
	+disables thread preemption and pins the thread to a CPU, no two threads
	+can write the same per-CPU buffer concurrently.
	+A hardware NMI on the same CPU can still nest inside an in-progress
	+thread-level writer (NMIs are not blocked by critical sections), so the
	+commit-CAS retry loop re-derives
	+.Va active
	+and
	+.Va commit_pos
	+from the post-CAS state on every failure and redoes the write at the new
	+offset; an NMI's intervening commit (with or without a buffer swap) is
	+therefore preserved.
	+.Pp
	+Buffer swap publishes the frozen
	+.Va commit_pos
	+as
	+.Va reader_len
	+and flips
	+.Va active_buf .
	+A reader observing
	+.Va swap_allowed = 0
	+is guaranteed to see the matching
	+.Va reader_len > 0 .
	+Writers perform a proactive swap when the active buffer is full and
	+.Va swap_allowed
	+is set; otherwise the event is dropped and the subscriber's
	+.Va dropped_events
	+counter is incremented.
	+The reader swaps buffers when it needs data.
	+Writers never spin waiting for the reader.
	+.Ss Timestamp Epoch Boundary
	+Each
	+.Fn read
	+call delivers events bounded by a per-read epoch.
	+After the initial buffer swap, the reader captures a
	+.Va read_timestamp .
	+Events with timestamps beyond this epoch remain in the reader buffer
	+for the next
	+.Fn read
	+call.
	+.Pp
	+During the merge loop, events are delivered in strict timestamp order
	+using a min-heap of CPUs sorted by next-event timestamp.
	+When an epoch boundary is hit, a resweep of idle CPUs catches events
	+from writers that committed within the epoch but whose CPU was previously
	+inactive.
	+.Ss Schema-Based Code Generation
	+Provider-specific event schemas are defined in
	+.Pa include/eventlog/
	+using
	+.Pa .src
	+files with the naming convention
	+.Ao Ar provider Ac Ns Pa _eventlog_schema.src .
	+The
	+.Pa eventlog_gen.awk
	+script processes these files in two modes:
	+.Bl -tag -width indent
	+.It Sy Producer mode Pq Fl h
	+Generates event structure definitions,
	+.Ql _ENABLED()
	+check macros,
	+.Ql _LOG()
	+macros
	+.Pq with enablement check ,
	+and
	+.Ql _LOG_ALWAYS()
	+macros
	+.Pq unconditional .
	+.It Sy Consumer mode Pq Fl c
	+Generates payload formatters, enum and flag lookup functions, and
	+.Fn event_id_to_name
	+dispatch functions for user-space tools.
	+.El
	+.Pp
	+The generated code follows this pattern:
	+.Bd -literal -offset indent
	+#define TCP_EVENTLOG_IN_ENABLED(__session) \e
	+ ((__session) != NULL && \e
	+ (__session)->effective_level >= EVENTLOG_LEVEL_VERBOSE && \e
	+ ((__session)->effective_keywords & TCP_EVENTLOG_KEYWORD_RX))
	+
	+#define TCP_EVENTLOG_IN_LOG(__session, ...) \e
	+ do { \e
	+ if (TCP_EVENTLOG_IN_ENABLED(__session)) \e
	+ TCP_EVENTLOG_IN_LOG_ALWAYS(__session, __VA_ARGS__); \e
	+ } while (0)
	+.Ed
	+.Pp
	+The enablement check reads two structure fields directly with no
	+function call and no lock, making the fast path when disabled a single
	+branch that is not taken.
	+.Ss Device Interface
	+The
	+.Pa /dev/eventlog
	+character device supports the following operations:
	+.Bl -tag -width indent
	+.It Fn open
	+Opens the device.
	+No subscriber is created at this point.
	+The framework is host-global and is not exposed to jailed processes:
	+.Fn open
	+fails with
	+.Er EPERM
	+when the calling thread is in a jail
	+.Pq Va cr_prison No is not Va prison0 .
	+.It Fn close
	+Destroys the subscriber if one was created, and updates provider
	+enablement.
	+.It Fn read
	+Reads events merged by timestamp across per-CPU buffers.
	+Each
	+.Fn read
	+returns zero or more complete events; no partial events are delivered.
	+Blocks for up to one second if no data is available, unless
	+.Dv FNONBLOCK
	+is set.
	+.It Fn ioctl
	+Manages subscriptions and buffer configuration using the following
	+commands:
	+.Bl -tag -width indent
	+.It Dv EVENTLOG_IOCTL_CREATE_SIZE Ns Pq Fa count
	+Creates a subscriber with the specified per-CPU buffer size and
	+subscribes to providers in one atomic operation.
	+The argument is a variable-length
	+.Vt "struct eventlog_create_req" .
	+.It Dv EVENTLOG_IOCTL_DESTROY
	+Unsubscribes from all providers and destroys the subscriber.
	+.It Dv EVENTLOG_IOCTL_GET_STATS
	+Returns the subscriber's
	+.Va dropped_events
	+count.
	+.It Dv EVENTLOG_IOCTL_GET_PROVIDERS
	+Returns the list of subscribed providers with their numeric IDs and
	+names.
	+Multiple entries may share the same name when multi-provider support
	+is in use.
	+.El
	+.El
	+.Ss Initialization
	+The framework initializes in three boot-time phases:
	+.Bl -enum -compact
	+.It
	+Mutexes are initialized at
	+.Dv SI_SUB_LOCK .
	+.It
	+The session UMA zone is created at
	+.Dv SI_SUB_KMEM .
	+.It
	+The
	+.Pa /dev/eventlog
	+character device is created at
	+.Dv SI_SUB_DRIVERS .
	+.El
	+.Pp
	+Providers register themselves during their own subsystem initialization.
	+.Sh RETURN VALUES
	+The
	+.Fn eventlog_provider_create
	+function returns a pointer to the new provider, or
	+.Dv NULL
	+on failure.
	+.Pp
	+The
	+.Fn eventlog_session_create
	+function returns a pointer to the new session, or
	+.Dv NULL
	+if allocation fails
	+.Pq when Fa waitok No is false .
	+.Pp
	+The
	+.Fn eventlog_provider_get_default
	+function returns 0
	+.Pq sessions start disabled
	+or 1
	+.Pq sessions start enabled .
	+.Pp
	+The
	+.Fn eventlog_provider_get_sysctl_node
	+and
	+.Fn eventlog_provider_get_sysctl_ctx
	+functions return the provider's auto-generated
	+.Xr sysctl 9
	+node and its backing context, which are owned by the framework.
	+Both always succeed for a valid provider.
	+.Pp
	+The
	+.Fn eventlog_session_is_enabled
	+function returns non-zero if the session is enabled, or 0 if disabled or
	+.Dv NULL .
	+.Pp
	+The
	+.Fn eventlog_provider_get_level
	+function returns the current aggregate log level.
	+.Pp
	+The
	+.Fn eventlog_provider_get_keywords
	+function returns the current aggregate keyword mask.
	+.Pp
	+The
	+.Fn eventlog_subscriber_create_device
	+and
	+.Fn eventlog_subscriber_create_callback
	+functions return a pointer to the new subscriber, or
	+.Dv NULL
	+on failure.
	+.Pp
	+The
	+.Fn eventlog_subscriber_add_subscription
	+function returns 0 on success or an error code on failure.
	+.Pp
	+The
	+.Fn eventlog_subscriber_read
	+function returns 0 on success or an error code on failure.
	+It returns
	+.Er EAGAIN
	+when no data is available and
	+.Dv FNONBLOCK
	+is set.
	+.Sh ERRORS
	+.Fn open
	+on
	+.Pa /dev/eventlog
	+may fail with:
	+.Bl -tag -width Er
	+.It Bq Er EPERM
	+The calling thread is in a jail
	+.Pq Va cr_prison No is not Va prison0 .
	+The eventlog framework is host-global and is not exposed to jailed
	+processes.
	+.It Bq Er ENODEV
	+The device was opened with
	+.Dv FWRITE ,
	+.Dv FEXEC ,
	+.Dv FAPPEND ,
	+or
	+.Dv O_TRUNC .
	+The device is read-only.
	+.El
	+.Sh SEE ALSO
	+.Xr elog 1 ,
	+.Xr tcp 4 ,
	+.Xr elog 5 ,
	+.Xr smr 9 ,
	+.Xr sysctl 9 ,
	+.Xr tcp_functions 9
	+.Sh HISTORY
	+The
	+.Nm
	+framework first appeared in
	+.Fx 16.0 .
	+.Sh AUTHORS
	+The
	+.Nm
	+framework was developed by
	+.An Netflix, Inc .
	diff --git a/sys/conf/files b/sys/conf/files
	--- a/sys/conf/files
	+++ b/sys/conf/files
	@@ -3862,6 +3862,7 @@
	kern/kern_environment.c standard
	kern/kern_et.c standard
	kern/kern_event.c standard
	+kern/kern_eventlog.c standard
	kern/kern_exec.c standard
	kern/kern_exit.c standard
	kern/kern_fail.c standard
	diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk
	--- a/sys/conf/kern.pre.mk
	+++ b/sys/conf/kern.pre.mk
	@@ -74,6 +74,44 @@

	INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include

	+# Generate eventlog provider headers from schema files in
	+# include/eventlog/. Each *_eventlog_schema.src is fed through
	+# include/eventlog/eventlog_gen.awk to produce <provider>_eventlog.h
	+# under ${.OBJDIR}/include/eventlog/, and that directory's parent is
	+# added to INCLUDES so kernel sources can do
	+# #include <eventlog/<provider>_eventlog.h>.
	+_EVENTLOG_HEADER_DIR= ${.OBJDIR}/include/eventlog
	+_EVENTLOG_SCHEMA_DIR= ${SRCTOP}/include/eventlog
	+# List the schema file names (basename only); empty when none are found.
	+_EVENTLOG_SCHEMAS!= find ${_EVENTLOG_SCHEMA_DIR} -name '*_eventlog_schema.src' -type f 2>/dev/null | ${AWK} -F/ '{print $$NF}' || echo ""
	+.if !empty(_EVENTLOG_SCHEMAS)
	+# Run the generator while the makefile is parsed (via !=), except for the
	+# clean targets. A header is regenerated when it is missing or older than
	+# either its schema or the generator script.
	+.if !make(clean) && !make(cleandir) && !make(clobber)
	+_EVENTLOG_GENHDRS!= mkdir -p ${_EVENTLOG_HEADER_DIR}; \
	+	awk_script="${_EVENTLOG_SCHEMA_DIR}/eventlog_gen.awk"; \
	+	for schema in ${_EVENTLOG_SCHEMAS}; do \
	+		schema_path="${_EVENTLOG_SCHEMA_DIR}/$$schema"; \
	+		provider=$$(${AWK} '/^PROVIDER/ {print tolower($$2); exit}' "$$schema_path" 2>/dev/null); \
	+		[ -n "$$provider" ] || continue; \
	+		header="${_EVENTLOG_HEADER_DIR}/$${provider}_eventlog.h"; \
	+		if [ ! -f "$$header" ] || \
	+		    [ "$$schema_path" -nt "$$header" ] || \
	+		    [ "$$awk_script" -nt "$$header" ]; then \
	+			cd ${SRCTOP} && ${AWK} -v outdir="${_EVENTLOG_HEADER_DIR}" -f include/eventlog/eventlog_gen.awk include/eventlog/$$schema -h; \
	+		fi; \
	+	done; echo done
	+.endif
	+# Also declare one explicit target per schema so the headers take part in
	+# the normal dependency graph through BEFORE_DEPEND.
	+.for schema in ${_EVENTLOG_SCHEMAS}
	+_EVENTLOG_PROVIDER_${schema}!= ${AWK} '/^PROVIDER/ {print tolower($$2); exit}' ${_EVENTLOG_SCHEMA_DIR}/${schema} 2>/dev/null || echo ""
	+.if !empty(_EVENTLOG_PROVIDER_${schema})
	+_EVENTLOG_HEADER_${schema}= ${_EVENTLOG_HEADER_DIR}/${_EVENTLOG_PROVIDER_${schema}}_eventlog.h
	+${_EVENTLOG_HEADER_${schema}}: ${_EVENTLOG_SCHEMA_DIR}/eventlog_gen.awk ${_EVENTLOG_SCHEMA_DIR}/${schema}
	+	@mkdir -p ${_EVENTLOG_HEADER_DIR}
	+	@cd ${SRCTOP} && ${AWK} -v outdir="${_EVENTLOG_HEADER_DIR}" -f include/eventlog/eventlog_gen.awk include/eventlog/${schema} -h
	+BEFORE_DEPEND+= ${_EVENTLOG_HEADER_${schema}}
	+.endif
	+.endfor
	+INCLUDES+= -I${_EVENTLOG_HEADER_DIR:H}
	+.endif
	+
	CFLAGS= ${COPTFLAGS} ${DEBUG}
	CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h
	CFLAGS_PARAM_INLINE_UNIT_GROWTH?=100
	diff --git a/sys/conf/kmod.mk b/sys/conf/kmod.mk
	--- a/sys/conf/kmod.mk
	+++ b/sys/conf/kmod.mk
	@@ -491,6 +491,37 @@
	${SYSDIR}/dev/bhnd/nvram/nvram_map -h
	.endif

	+# Generate an eventlog provider header from a single schema file. A kmod
	+# opts in by setting EVENTLOG_SCHEMA=<provider>_eventlog_schema.src; the
	+# schema is fed through include/eventlog/eventlog_gen.awk to produce
	+# <provider>_eventlog.h under ${OBJTOP}/sys/include/eventlog/. The
	+# header is added to SRCS so depend / clean see it, and its parent
	+# directory is added to -I so the kmod's sources can do
	+# #include <eventlog/<provider>_eventlog.h>.
	+.if !empty(EVENTLOG_SCHEMA)
	+EVENTLOG_SCHEMA_PATH= ${SRCTOP}/include/eventlog/${EVENTLOG_SCHEMA}
	+# The provider name is the second word of the schema's PROVIDER line,
	+# lower-cased.
	+EVENTLOG_PROVIDER!= ${AWK} '/^PROVIDER/ {print tolower($$2); exit}' ${EVENTLOG_SCHEMA_PATH}
	+EVENTLOG_HEADER_DIR= ${OBJTOP}/sys/include/eventlog
	+EVENTLOG_HEADER_DIR:= ${EVENTLOG_HEADER_DIR:tA}
	+EVENTLOG_HEADER= ${EVENTLOG_PROVIDER}_eventlog.h
	+EVENTLOG_HEADER_PATH= ${EVENTLOG_HEADER_DIR}/${EVENTLOG_HEADER}
	+SRCS+= ${EVENTLOG_HEADER_PATH}
	+CLEANFILES+= ${EVENTLOG_HEADER_PATH}
	+# Run the generator while the makefile is parsed (via !=), except for the
	+# clean targets. The header is regenerated when it is missing or older
	+# than either the schema or the generator script.
	+.if !make(clean) && !make(cleandir) && !make(clobber)
	+_EVENTLOG_GENHDR!= mkdir -p ${EVENTLOG_HEADER_DIR}; \
	+	if [ ! -f ${EVENTLOG_HEADER_PATH} ] || \
	+	    [ ${EVENTLOG_SCHEMA_PATH} -nt ${EVENTLOG_HEADER_PATH} ] || \
	+	    [ ${SRCTOP}/include/eventlog/eventlog_gen.awk -nt ${EVENTLOG_HEADER_PATH} ]; then \
	+		cd ${SRCTOP} && ${AWK} -v outdir=${EVENTLOG_HEADER_DIR} -f include/eventlog/eventlog_gen.awk ${EVENTLOG_SCHEMA_PATH} -h; \
	+	fi; echo done
	+.endif
	+CFLAGS+= -I${EVENTLOG_HEADER_DIR:H}
	+${EVENTLOG_HEADER_PATH}: ${SRCTOP}/include/eventlog/eventlog_gen.awk ${EVENTLOG_SCHEMA_PATH}
	+	@mkdir -p ${EVENTLOG_HEADER_DIR}
	+	@cd ${SRCTOP} && ${AWK} -v outdir=${EVENTLOG_HEADER_DIR} -f include/eventlog/eventlog_gen.awk ${EVENTLOG_SCHEMA_PATH} -h
	+beforedepend: ${EVENTLOG_HEADER_PATH}
	+.endif
	+
	.if !empty(SRCS:Mbhnd_nvram_map_data.h)
	CLEANFILES+= bhnd_nvram_map_data.h
	bhnd_nvram_map_data.h: ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.awk \
	diff --git a/sys/kern/kern_eventlog.c b/sys/kern/kern_eventlog.c
	new file mode 100644
	--- /dev/null
	+++ b/sys/kern/kern_eventlog.c
	@@ -0,0 +1,2630 @@
	+/*
	+ * Copyright (c) 2026 Netflix, Inc.
	+ *
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ */
	+
	+/*
	+ * MEMORY ACCESS AND SYNCHRONIZATION MODEL
	+ * =======================================
	+ * Per-CPU double-buffering: Two buffers per CPU. Writers use "active" buffer;
	+ * readers use "reader" buffer (1 - active). Swap when reader is empty and
	+ * active has data.
	+ *
	+ * Invariant: There is NEVER partial data in either buffer. Each buffer
	+ * contains zero or more complete events (header + payload).
	+ *
	+ * Per-CPU writer concurrency: All write paths enter via smr_enter() which
	+ * calls critical_enter(), disabling thread preemption. Thread-level
	+ * writer-vs-writer contention is therefore impossible. Hardware NMIs are
	+ * NOT blocked by critical sections, however, so an NMI-context writer can
	+ * nest inside an in-progress thread-level writer on the same CPU. The
	+ * protocol tolerates this: every state-changing step (try_swap, commit-CAS)
	+ * re-derives active and commit_pos from the post-CAS state and re-checks
	+ * capacity, so an NMI's intervening commit (with or without a buffer swap)
	+ * is preserved.
	+ *
	+ * Packed state: reader_len (30 bits), commit_pos (30 bits), active_buf, and
	+ * swap_allowed are packed into a single 64-bit word (packed_state). A
	+ * single CAS atomically publishes any state transition (writer commit,
	+ * buffer swap, reader drain).
	+ *
	+ * Writer:
	+ * (1) Load packed_state to get commit_pos and active_buf.
	+ * (2) Check capacity: if commit_pos + event_len > buffer_size, attempt a
	+ * proactive swap if swap_allowed. After try_swap, re-derive active
	+ * and commit_pos from the post-CAS state and re-check capacity (an
	+ * NMI on this CPU may have already swapped or partially filled the
	+ * new active buffer). Drops only if swap is not allowed (reader
	+ * still draining) or no room remains after the swap.
	+ * (3) Write: memcpy event data to buffer at commit_pos offset.
	+ * (4) Commit: CAS packed_state to advance commit_pos. On CAS failure,
	+ * re-derive active and commit_pos from the updated state. If either
	+ * changed (peer reader swap, NMI commit, or NMI swap-and-commit),
	+ * redo the write at the new offset; otherwise just recompute the
	+ * desired packed_state value and retry the CAS.
	+ *
	+ * Reader: Single reader only. Reads from reader buffer. No lock needed for
	+ * reads. Advances read_pos by full event lengths. When fully drained, zeros
	+ * read_pos and then, in one state transition, clears reader_len and sets
	+ * swap_allowed (giving writers the earliest possible permission to
	+ * proactively swap on buffer-full).
	+ *
	+ * Swap publication:
	+ * reader_len is packed into the upper 32 bits of packed_state and the
	+ * swap is a single transition that flips active_buf, zeros commit_pos,
	+ * clears swap_allowed, and publishes reader_len = old commit_pos. Two
	+ * concurrent try_swap callers cannot clobber each other: exactly one
	+ * wins; the loser sees the post-swap state and reader_len = winner's
	+ * commit_pos.
	+ *
	+ * On targets where the MI atomic_*_64 API is available (__LP64__, i.e.
	+ * every 64-bit FreeBSD architecture) the swap is a single
	+ * atomic_fcmpset_64; the path is lock-free and NMI-safe by construction.
	+ *
	+ * On 32-bit targets that do not provide atomic_*_64 (FreeBSD's MI
	+ * atomic_load_64 is itself gated on __LP64__),
	+ * the same 64-bit packed_state is used but every state operation (load,
	+ * commit, swap, drain) takes a per-pcpu_buf MTX_SPIN that serialises
	+ * access to the otherwise non-atomic 64-bit field. No atomics are
	+ * needed inside the helpers; the lock provides both serialisation and
	+ * visibility. NMI-safety is provided up-front: the writer entry point
	+ * checks mtx_owned(&pcpu_buf->swap_lock) and drops the event if true
	+ * (in NMI context curthread is the interrupted thread, so mtx_owned()
	+ * being true means we would mtx_lock_spin against ourselves and
	+ * deadlock). No caller-visible flag or per-helper trylock is needed.
	+ *
	+ * Key properties (both implementations):
	+ * - Writers NEVER spin waiting for the reader. They perform the swap
	+ * themselves or drop if swap is not allowed.
	+ * - No critical section beyond the one smr_enter() already provides is
	+ * needed: the writer's commit CAS detects and handles concurrent reader
	+ * swaps by retrying.
	+ * - Writer can proactively swap on buffer-full when swap_allowed is set,
	+ * reducing event drops.
	+ * - swap_allowed=0 implies reader_len > 0 (try_swap is only invoked with
	+ * commit_pos > 0, and the swap publishes reader_len = commit_pos).
	+ */
	+
	+#define EVENTLOG_INTERNAL
	+#include <sys/cdefs.h>
	+#include <sys/param.h>
	+#include <sys/systm.h>
	+#include <sys/kernel.h>
	+#include <sys/condvar.h>
	+#include <sys/lock.h>
	+#include <sys/malloc.h>
	+#include <sys/mutex.h>
	+#include <sys/sx.h>
	+#include <sys/jail.h>
	+#include <sys/proc.h>
	+#include <sys/queue.h>
	+#include <sys/ck.h>
	+#include <sys/smr.h>
	+#include <sys/sbuf.h>
	+#include <sys/sysctl.h>
	+#include <sys/taskqueue.h>
	+#include <sys/counter.h>
	+#include <sys/sysent.h>
	+#include <sys/sysproto.h>
	+#include <sys/eventlog.h>
	+#include <sys/eventlog_subscriber.h>
	+#include <sys/smp.h>
	+#include <sys/time.h>
	+#include <sys/limits.h>
	+#include <machine/cpu.h>
	+#include <machine/atomic.h>
	+#include <sys/conf.h>
	+#include <fs/devfs/devfs.h>
	+#include <sys/fcntl.h>
	+#include <sys/uio.h>
	+#include <sys/libkern.h>
	+#include <sys/ioccom.h>
	+#include <sys/time.h>
	+#include <vm/vm.h>
	+#include <vm/uma.h>
	+
	+/*
	+ * Used to disable inlining to help debug performance issues via flamegraphs.
	+ * As written the replacement list is only a comment, so EVENTLOG_INLINING
	+ * expands to nothing; define it as __noinline to keep the helpers that use
	+ * it out of line.
	+ */
	+#define EVENTLOG_INLINING //__noinline
	+
	+MALLOC_DEFINE(M_EVENTLOG, "eventlog", "eventlog subsystem");
	+
	+/*
	+ * Full definition of eventlog_session - private to this file; header
	+ * has partial/forward only.
	+ */
	+struct eventlog_session {
	+ /*
	+ * Fast-path fields: the schema-generated _ENABLED() macros read these
	+ * two members directly. NOTE(review): they presumably must stay first,
	+ * in this order, to match the partial definition in the header -
	+ * confirm before reordering.
	+ */
	+ enum eventlog_level effective_level;
	+ uint32_t effective_keywords;
	+ /* Private fields - only visible in this file */
	+ struct eventlog_provider *provider;
	+ /* NOTE(review): presumably the entry on provider->sessions - confirm. */
	+ LIST_ENTRY(eventlog_session) link;
	+ uint64_t session_id; /* Unique id (e.g., inp_gencnt for TCP) */
	+ uint64_t created_at; /* us since boot when session was created */
	+ /*
	+ * NOTE(review): override_level/override_keywords look meaningful only
	+ * while has_override is non-zero - confirm against the code that sets
	+ * them.
	+ */
	+ enum eventlog_level override_level;
	+ uint32_t override_keywords;
	+ uint8_t disabled;
	+ uint8_t has_override;
	+};
	+
	+/*
	+ * Shared statistics for all providers with the same name.
	+ * Reference-counted: created on first provider, freed when last is destroyed.
	+ * Protected by evl.providers_lock.
	+ */
	+struct eventlog_provider_stats {
	+ int refcount; /* Providers currently sharing this entry */
	+ /*
	+ * NOTE(review): presumably the value reported by
	+ * eventlog_provider_get_default() (0 = sessions start disabled,
	+ * 1 = sessions start enabled) - confirm.
	+ */
	+ int default_enabled;
	+ LIST_ENTRY(eventlog_provider_stats) link;
	+ /* Per-name session counters. */
	+ counter_u64_t sessions_created;
	+ counter_u64_t sessions_active;
	+ counter_u64_t sessions_enabled;
	+ /* Context backing sysctl_node below. */
	+ struct sysctl_ctx_list sysctl_ctx;
	+ /*
	+ * kern.eventlog.<name>; exposed to providers via
	+ * eventlog_provider_get_sysctl_node().
	+ */
	+ struct sysctl_oid *sysctl_node;
	+ /* Provider name shared by every provider referencing this entry. */
	+ char name[EVENTLOG_PROVIDER_NAME_MAX];
	+};
	+
	+/* Full definition of eventlog_provider */
	+struct eventlog_provider {
	+ /*
	+ * NOTE(review): presumably guards the sessions list as well as
	+ * has_subscribers - confirm against the list manipulation code.
	+ */
	+ struct mtx sessions_lock;
	+ LIST_HEAD(, eventlog_session) sessions;
	+ LIST_ENTRY(eventlog_provider) link;
	+ /* Statistics shared by all providers registered under the same name. */
	+ struct eventlog_provider_stats *stats;
	+ /*
	+ * Provider-supplied callbacks, each paired with the opaque argument
	+ * stored next to it. NOTE(review): whether NULL callbacks are
	+ * permitted is not visible in this part of the file.
	+ */
	+ eventlog_provider_dump_state_t dump_callback;
	+ void *dump_callback_arg;
	+ eventlog_default_changed_t default_changed;
	+ void *default_changed_arg;
	+ eventlog_subscribers_changed_t subscribers_changed;
	+ void *subscribers_changed_arg;
	+ /*
	+ * NOTE(review): presumably the aggregate level / keyword mask returned
	+ * by eventlog_provider_get_level() / eventlog_provider_get_keywords()
	+ * - confirm.
	+ */
	+ enum eventlog_level level;
	+ uint32_t keywords;
	+ bool has_subscribers; /* tracked under sessions_lock */
	+ uint16_t provider_id; /* Unique ID assigned on registration */
	+ uint8_t name_len; /* excluding null terminator */
	+ char name[EVENTLOG_PROVIDER_NAME_MAX];
	+};
	+
	+/*
	+ * Full definition of eventlog_subscription. CK_SLIST for lock-free traversal
	+ * in SMR read path.
	+ */
	+struct eventlog_subscription {
	+ /* Entry on the owning subscriber's subscriptions list. */
	+ CK_SLIST_ENTRY(eventlog_subscription) link;
	+ /* Provider this subscription refers to. */
	+ struct eventlog_provider *provider;
	+ /*
	+ * Level and keyword mask requested for that provider.
	+ * NOTE(review): the exact matching rule lives in the write path,
	+ * which is not visible here.
	+ */
	+ enum eventlog_level level;
	+ uint32_t keywords;
	+};
	+
	+/*
	+ * Per-CPU buffer structure for double-buffering. See "MEMORY ACCESS AND
	+ * SYNCHRONIZATION MODEL" at the top of this file for the protocol; this
	+ * block documents only the data layout.
	+ *
	+ * packed_state layout:
	+ * [63:32] reader_len - bytes in reader buffer (set at swap)
	+ * [31:2] commit_pos - bytes committed to active buffer (= write cursor)
	+ * [1] swap_allowed - reader buffer is empty, writer may proactively swap
	+ * [0] active_buf - which buffer (0 or 1) is the active writer buffer
	+ *
	+ * 30-bit commit_pos and 30-bit reader_len each support buffers up to 1 GB
	+ * (the enforced maximum). Initialised with swap_allowed=1. Because
	+ * commit_pos lives in [31:2], a writer commit can simply add
	+ * (event_len << 2) to packed_state without disturbing the upper bits
	+ * (commit_pos + event_len <= buffer_size <= 1 GB rules out overflow).
	+ * The SMR critical section pins the writer to one CPU, so commit_pos
	+ * also serves as the writer's reservation cursor.
	+ *
	+ * EVENTLOG_FORCE_SWAP_LOCK overrides the LP64 detection so the fallback
	+ * path can be compile- and run-tested on 64-bit hosts.
	+ */
	+/*
	+ * Select the lock-free 64-bit atomic implementation on LP64 targets unless
	+ * the spin-lock fallback is forced with EVENTLOG_FORCE_SWAP_LOCK.
	+ */
	+#if defined(__LP64__) && !defined(EVENTLOG_FORCE_SWAP_LOCK)
	+#define EVENTLOG_HAS_ATOMIC64 1
	+#endif
	+
	+/* Flag bits and field shift for the low 32 bits of packed_state. */
	+#define EVTLOG_ACTIVE_BUF 0x1U /* bit 0: active_buf */
	+#define EVTLOG_SWAP_ALLOWED 0x2U /* bit 1: swap_allowed */
	+#define EVTLOG_COMMIT_SHIFT 2 /* commit_pos starts at bit 2 */
	+
	+/* reader_len lives in the upper 32 bits of packed_state. */
	+#define EVTLOG_READER_LEN_SHIFT 32
	+/* Move a reader_len value into its packed_state position. */
	+#define EVTLOG_PACK_READER_LEN(rl) \
	+ (((uint64_t)(uint32_t)(rl)) << EVTLOG_READER_LEN_SHIFT)
	+/* Mask selecting the whole reader_len field. */
	+#define EVTLOG_READER_LEN_MASK \
	+ (((uint64_t)UINT32_MAX) << EVTLOG_READER_LEN_SHIFT)
	+
	+struct eventlog_percpu_buffer {
	+	void *buffers[2];		/* Two buffers: [0] and [1] */
	+	uint32_t buffer_size;		/* Capacity of each buffer, in bytes */
	+	uint32_t read_pos;		/* Read cursor in reader buffer */
	+#ifndef EVENTLOG_HAS_ATOMIC64
	+	struct mtx swap_lock;		/* MTX_SPIN; covers all state ops */
	+#endif
	+	/*
	+	 * reader_len, commit_pos, swap_allowed and active_buf packed into
	+	 * one word; see the layout above.
	+	 */
	+	volatile uint64_t packed_state;
	+} __aligned(CACHE_LINE_SIZE);
	+
	+/*
	+ * Atomic state abstraction. evtlog_state_t carries the entire per-CPU
	+ * buffer state observable by callers as a single uint64_t with the layout
	+ * documented above. Both implementations operate on the same word; only
	+ * the synchronisation primitive (atomic_*_64 vs spin-mutex) differs.
	+ */
	+typedef uint64_t evtlog_state_t;
	+
	+/* Extract active_buf (bit 0): the index, 0 or 1, of the active buffer. */
	+static inline int
	+evtlog_state_active(evtlog_state_t s)
	+{
	+	const evtlog_state_t active_bit = s & EVTLOG_ACTIVE_BUF;
	+
	+	return ((int)active_bit);
	+}
	+
	+/* Extract commit_pos, which sits above the two flag bits of the low word. */
	+static inline uint32_t
	+evtlog_state_commit_pos(evtlog_state_t s)
	+{
	+	const uint32_t low_word = (uint32_t)s;
	+
	+	return (low_word >> EVTLOG_COMMIT_SHIFT);
	+}
	+
	+/* Report whether the swap_allowed flag (bit 1) is set. */
	+static inline bool
	+evtlog_state_swap_allowed(evtlog_state_t s)
	+{
	+	const evtlog_state_t swap_bit = s & EVTLOG_SWAP_ALLOWED;
	+
	+	return (swap_bit != 0);
	+}
	+
	+/* Extract reader_len from the upper 32 bits of the packed state. */
	+static inline uint32_t
	+evtlog_state_reader_len(evtlog_state_t s)
	+{
	+	const evtlog_state_t upper = s >> EVTLOG_READER_LEN_SHIFT;
	+
	+	return ((uint32_t)upper);
	+}
	+
	+/*
	+ * Take a snapshot of packed_state: an acquire load on the atomic path, a
	+ * plain read under swap_lock on the fallback path.
	+ */
	+static inline evtlog_state_t
	+evtlog_load_state(struct eventlog_percpu_buffer *pcpu)
	+{
	+	evtlog_state_t snapshot;
	+
	+#ifdef EVENTLOG_HAS_ATOMIC64
	+	snapshot = atomic_load_acq_64(&pcpu->packed_state);
	+#else
	+	mtx_lock_spin(&pcpu->swap_lock);
	+	snapshot = pcpu->packed_state;
	+	mtx_unlock_spin(&pcpu->swap_lock);
	+#endif
	+	return (snapshot);
	+}
	+
	+/*
	+ * Atomically advance commit_pos by event_len. Returns true on success;
	+ * on failure, *state is updated to the current packed state so the caller
	+ * can re-derive active and commit_pos and decide whether to redo the write.
	+ *
	+ * The new value is formed by adding (event_len << EVTLOG_COMMIT_SHIFT) to
	+ * the whole word, so it relies on the sum not carrying out of the
	+ * commit_pos field.
	+ *
	+ * NOTE(review): the two implementations differ on success. The atomic path
	+ * returns the result of atomic_fcmpset_64(), which only rewrites *state
	+ * when the compare fails, so *state keeps the pre-commit value; the
	+ * spin-lock path stores new_state into *state. Confirm that no caller
	+ * reads *state after a successful commit.
	+ *
	+ * NOTE(review): the plain atomic_fcmpset_64() variant carries no acquire or
	+ * release ordering. Confirm that the caller orders the event memcpy before
	+ * this commit (e.g. with a release fence) on weakly ordered CPUs.
	+ *
	+ * NOTE(review): commit_pos is a 30-bit field, so a value of exactly 2^30
	+ * (a completely full 1 GB buffer) would carry into reader_len. Confirm the
	+ * enforced buffer_size maximum keeps commit_pos below 2^30.
	+ */
	+static inline bool
	+evtlog_try_commit(struct eventlog_percpu_buffer *pcpu,
	+ evtlog_state_t *state, uint32_t event_len)
	+{
	+ evtlog_state_t new_state;
	+#ifndef EVENTLOG_HAS_ATOMIC64
	+ bool ok;
	+#endif
	+
	+ new_state = *state + ((uint64_t)event_len << EVTLOG_COMMIT_SHIFT);
	+#ifdef EVENTLOG_HAS_ATOMIC64
	+ return (atomic_fcmpset_64(&pcpu->packed_state, state, new_state));
	+#else
	+ /* Emulate compare-and-set under the per-buffer spin lock. */
	+ mtx_lock_spin(&pcpu->swap_lock);
	+ if (pcpu->packed_state == *state) {
	+ pcpu->packed_state = new_state;
	+ *state = new_state;
	+ ok = true;
	+ } else {
	+ *state = pcpu->packed_state;
	+ ok = false;
	+ }
	+ mtx_unlock_spin(&pcpu->swap_lock);
	+ return (ok);
	+#endif
	+}
	+
	+/*
	+ * Try to perform a buffer swap atomically. See "Swap publication" in the
	+ * SYNC MODEL at the top of this file for the protocol and the per-impl
	+ * synchronisation primitive.
	+ *
	+ * On success returns true and *old_state is updated to the post-swap
	+ * state (active_buf flipped, commit_pos=0, swap_allowed clear, reader_len
	+ * = pre-swap commit_pos). On failure returns false and *old_state is
	+ * refreshed with the latest observed packed state so the caller can
	+ * re-check capacity after a peer swap.
	+ *
	+ * Precondition: commit_pos > 0 in *old_state.
	+ */
	+static inline bool
	+evtlog_try_swap(struct eventlog_percpu_buffer *pcpu,
	+    evtlog_state_t *old_state)
	+{
	+	evtlog_state_t state = *old_state;
	+	evtlog_state_t new_state;
	+	uint32_t commit;
	+#ifndef EVENTLOG_HAS_ATOMIC64
	+	bool ok;
	+#endif
	+
	+	commit = evtlog_state_commit_pos(state);
	+	MPASS(commit > 0);
	+	/*
	+	 * Post-swap word: only the flipped active_buf bit and the published
	+	 * reader_len are set, which leaves commit_pos at 0 and swap_allowed
	+	 * clear.
	+	 */
	+	new_state = ((state & EVTLOG_ACTIVE_BUF) ^ EVTLOG_ACTIVE_BUF) |
	+	    EVTLOG_PACK_READER_LEN(commit);
	+
	+#ifdef EVENTLOG_HAS_ATOMIC64
	+	/* On failure fcmpset has already refreshed *old_state. */
	+	if (!atomic_fcmpset_64(&pcpu->packed_state, old_state, new_state))
	+		return (false);
	+#else
	+	/* Emulate compare-and-set under the per-buffer spin lock. */
	+	mtx_lock_spin(&pcpu->swap_lock);
	+	if (pcpu->packed_state == *old_state) {
	+		pcpu->packed_state = new_state;
	+		ok = true;
	+	} else {
	+		*old_state = pcpu->packed_state;
	+		ok = false;
	+	}
	+	mtx_unlock_spin(&pcpu->swap_lock);
	+	if (!ok)
	+		return (false);
	+#endif
	+	*old_state = new_state;
	+	return (true);
	+}
	+
	+/*
	+ * Mark the reader buffer empty: clear reader_len and set swap_allowed.
	+ * Caller must have just consumed all bytes in the reader buffer
	+ * (read_pos == reader_len) and runs in the single reader thread (never
	+ * NMI), so blocking on the swap lock in the fallback path is safe.
	+ *
	+ * read_pos is reset to 0 before the state word is updated. The low-word
	+ * fields (commit_pos, active_buf) are carried over unchanged.
	+ */
	+static inline void
	+evtlog_drain_complete(struct eventlog_percpu_buffer *pcpu_buf)
	+{
	+#ifdef EVENTLOG_HAS_ATOMIC64
	+	uint64_t state, new_state;
	+
	+	pcpu_buf->read_pos = 0;
	+	state = atomic_load_acq_64(&pcpu_buf->packed_state);
	+	do {
	+		/*
	+		 * Recomputed on every retry: a failed fcmpset refreshes
	+		 * state with the current word.
	+		 */
	+		new_state = (state & ~EVTLOG_READER_LEN_MASK) |
	+		    EVTLOG_SWAP_ALLOWED;
	+	} while (!atomic_fcmpset_64(&pcpu_buf->packed_state, &state,
	+	    new_state));
	+#else
	+	mtx_lock_spin(&pcpu_buf->swap_lock);
	+	pcpu_buf->read_pos = 0;
	+	pcpu_buf->packed_state = (pcpu_buf->packed_state &
	+	    ~EVTLOG_READER_LEN_MASK) | EVTLOG_SWAP_ALLOWED;
	+	mtx_unlock_spin(&pcpu_buf->swap_lock);
	+#endif
	+}
	+
	+/*
	+ * Validate that a buffer contains only complete events (no partial data).
	+ * buffer: pointer to buffer, buffer_size: capacity, start: offset to begin,
	+ * written_len: bytes of data. Call with __LINE__ for panic diagnostics.
	+ */
	+#ifdef INVARIANTS
	+static inline void
	+eventlog_validate_buffer(void *buffer, size_t buffer_size, size_t start,
	+    size_t written_len, int line)
	+{
	+	const size_t hdr_size = sizeof(struct eventlog_event_header);
	+	const uint8_t *base = buffer;
	+	struct eventlog_event_header ev;
	+	size_t cursor;
	+
	+	/* The window [start, written_len) must lie inside the buffer. */
	+	KASSERT(start <= written_len,
	+	    ("%s: start %zu > written_len %zu (caller line %d)",
	+	    __func__, start, written_len, line));
	+	KASSERT(written_len <= buffer_size,
	+	    ("%s: written_len %zu > buffer_size %zu (caller line %d)",
	+	    __func__, written_len, buffer_size, line));
	+	if (written_len == 0)
	+		return;
	+	KASSERT(written_len >= hdr_size,
	+	    ("%s: partial data, written_len %zu < header (line %d)",
	+	    __func__, written_len, line));
	+
	+	/*
	+	 * Walk event by event. Each header is copied out before it is
	+	 * inspected, and every event must end at or before written_len.
	+	 */
	+	for (cursor = start; cursor < written_len;
	+	    cursor += ev.event_length) {
	+		KASSERT(cursor + hdr_size <= written_len,
	+		    ("%s: truncated header at offset %zu (line %d)",
	+		    __func__, cursor, line));
	+		memcpy(&ev, base + cursor, hdr_size);
	+		KASSERT(ev.event_length >= hdr_size,
	+		    ("%s: invalid event_length %u at offset %zu (line %d)",
	+		    __func__, ev.event_length, cursor, line));
	+		KASSERT(cursor + ev.event_length <= written_len,
	+		    ("%s: event overrun at offset %zu len %u (line %d)",
	+		    __func__, cursor, ev.event_length, line));
	+	}
	+
	+	/* The walk has to land exactly on the end of the written data. */
	+	KASSERT(cursor == written_len,
	+	    ("%s: partial event at end, offset %zu != written_len %zu"
	+	    " (caller line %d)",
	+	    __func__, cursor, written_len, line));
	+}
	+
	+/*
	+ * Convenience wrappers around eventlog_validate_buffer(). Each takes one
	+ * snapshot of the packed state and passes __LINE__ so a failed assertion
	+ * names the call site.
	+ *
	+ * EVENTLOG_VALIDATE_READER checks the non-active buffer, from read_pos up
	+ * to the reader length recorded in the state snapshot.
	+ */
	+#define EVENTLOG_VALIDATE_READER(pcpu_buf) do { \
	+ evtlog_state_t _vs = evtlog_load_state(pcpu_buf); \
	+ eventlog_validate_buffer( \
	+ (pcpu_buf)->buffers[1 - evtlog_state_active(_vs)], \
	+ (pcpu_buf)->buffer_size, (pcpu_buf)->read_pos, \
	+ evtlog_state_reader_len(_vs), __LINE__); \
	+} while (0)
	+/*
	+ * EVENTLOG_VALIDATE_WRITER checks the active buffer, from offset 0 up to
	+ * the commit position recorded in the state snapshot.
	+ */
	+#define EVENTLOG_VALIDATE_WRITER(pcpu_buf) do { \
	+ evtlog_state_t _vs = evtlog_load_state(pcpu_buf); \
	+ eventlog_validate_buffer( \
	+ (pcpu_buf)->buffers[evtlog_state_active(_vs)], \
	+ (pcpu_buf)->buffer_size, 0, \
	+ evtlog_state_commit_pos(_vs), __LINE__); \
	+} while (0)
	+#else
	+#define EVENTLOG_VALIDATE_READER(pcpu_buf) do { } while (0)
	+#define EVENTLOG_VALIDATE_WRITER(pcpu_buf) do { } while (0)
	+#endif
	+
	+/*
	+ * Return the timestamp of the event whose header starts at buf.
	+ *
	+ * The header is copied into a local with memcpy() rather than read through
	+ * a cast pointer, matching eventlog_validate_buffer(): buf is a byte offset
	+ * into an event buffer and is not known here to be aligned for the header
	+ * type.
	+ */
	+static inline uint64_t
	+eventlog_read_timestamp(const void *buf)
	+{
	+ struct eventlog_event_header hdr;
	+
	+ memcpy(&hdr, buf, sizeof(hdr));
	+ return (hdr.timestamp);
	+}
	+
	+/*
	+ * Peek at the next event's timestamp from a CPU buffer's reader buffer.
	+ * Does not advance the buffer read position.
	+ *
	+ * The reader buffer is the one that is not marked active in the packed
	+ * state. The header at read_pos is read unconditionally.
	+ * NOTE(review): presumably callers only peek when unread data remains at
	+ * read_pos -- confirm against the call sites.
	+ */
	+static EVENTLOG_INLINING uint64_t
	+eventlog_peek_next_timestamp(struct eventlog_percpu_buffer *pcpu_buf)
	+{
	+ const uint8_t *ptr;
	+ /* Index of the non-active buffer, from one load of the state. */
	+ int reader = 1 - evtlog_state_active(evtlog_load_state(pcpu_buf));
	+
	+ /* The validation macro takes its own, separate state snapshot. */
	+ EVENTLOG_VALIDATE_READER(pcpu_buf);
	+ ptr = (const uint8_t *)pcpu_buf->buffers[reader] + pcpu_buf->read_pos;
	+ return (eventlog_read_timestamp(ptr));
	+}
	+
	+/* Sentinel for timestamp: no next-event timestamp. */
	+#define EVENTLOG_TIMESTAMP_NONE UINT64_MAX
	+/* CPU already checked by resweep during this read (skip next time). */
	+#define EVENTLOG_TIMESTAMP_SWEPT (UINT64_MAX - 1)
	+
	+/* Full definition of eventlog_subscriber (internal only) */
	+CK_LIST_HEAD(eventlog_subscriber_head, eventlog_subscriber);
	+struct eventlog_subscriber {
	+ /* Linkage on a struct eventlog_subscriber_head list. */
	+ CK_LIST_ENTRY(eventlog_subscriber) link;
	+ /* Head of this subscriber's list of subscriptions. */
	+ CK_SLIST_HEAD(, eventlog_subscription) subscriptions;
	+ /* Subscriber kind; presumably selects the valid arm of u (confirm). */
	+ enum eventlog_subscriber_type type;
	+
	+ union {
	+ /* Device-based subscriber: per-CPU buffers */
	+ struct {
	+ struct eventlog_percpu_buffer *percpu_buffers;
	+ uint32_t buffer_size_per_cpu;
	+ /* Atomic: non-zero if reader is waiting. */
	+ volatile uint32_t reader_waiting;
	+ /* [maxcpu] next-event timestamp per CPU. */
	+ uint64_t *cpu_timestamps;
	+ /* Min-heap of CPU indices by timestamp. */
	+ uint16_t *heap_cpus;
	+ uint16_t heap_size; /* Number of CPUs in heap */
	+ } device;
	+ /* Callback-based subscriber: callback function */
	+ struct {
	+ eventlog_callback_t callback;
	+ void *callback_arg;
	+ } callback;
	+ } u;
	+
	+ /*
	+ * Async dump_state coordination. dump_pending counts queued +
	+ * in-flight dump tasks targeting this subscriber; destroy/drain
	+ * waits on the cv until it hits zero. The mtx covers both fields.
	+ */
	+ struct mtx dump_pending_mtx;
	+ struct cv dump_pending_cv;
	+ u_int dump_pending;
	+
	+ /* Statistics */
	+ /* Count of dropped events; updated outside this chunk. */
	+ volatile u_long dropped_events;
	+};
	+
	+/*
	+ * Min-heap of CPU indices ordered by next-event timestamp.
	+ * heap_cpus[0] is the CPU with minimum timestamp when heap_size > 0.
	+ * Stored as implicit binary heap: parent at i, children at 2i+1 and 2i+2.
	+ */
	+
	+/*
	+ * Add cpu to the min-heap, keyed by timestamp, and record the key in
	+ * cpu_timestamps[cpu]. The entry starts in the slot just past the current
	+ * end; parents with a larger key are shifted down until the entry's place
	+ * is found. O(log n).
	+ */
	+static EVENTLOG_INLINING void
	+eventlog_heap_insert(struct eventlog_subscriber *subscriber, uint16_t cpu,
	+ uint64_t timestamp)
	+{
	+ uint64_t *keys = subscriber->u.device.cpu_timestamps;
	+ uint16_t *heap = subscriber->u.device.heap_cpus;
	+ size_t slot, up;
	+
	+ keys[cpu] = timestamp;
	+
	+ /* Claim the next free slot; for an empty heap that is the root. */
	+ slot = subscriber->u.device.heap_size++;
	+
	+ /* Bubble up: pull larger parents down into the hole. */
	+ while (slot > 0) {
	+ up = (slot - 1) / 2;
	+ if (keys[heap[up]] <= timestamp)
	+ break;
	+ heap[slot] = heap[up];
	+ slot = up;
	+ }
	+ heap[slot] = cpu;
	+}
	+
	+/*
	+ * Extract the CPU with minimum timestamp from the heap. Caller must ensure
	+ * heap_size > 0. O(log n).
	+ *
	+ * The removed CPU's cpu_timestamps[] slot is reset to
	+ * EVENTLOG_TIMESTAMP_NONE. The former last entry is then sifted down from
	+ * the root. At every level the children are compared against the key of
	+ * the entry being sifted, not against whichever index currently sits in
	+ * the hole, so the walk continues until the entry reaches its final slot.
	+ */
	+static inline void
	+eventlog_heap_extract_min(struct eventlog_subscriber *subscriber)
	+{
	+ uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
	+ uint16_t *heap_cpus = subscriber->u.device.heap_cpus;
	+ uint16_t *heap_size = &subscriber->u.device.heap_size;
	+ uint64_t replaced_ts;
	+ uint16_t replaced;
	+ size_t i, child, n;
	+
	+ MPASS(*heap_size > 0);
	+
	+ timestamps[heap_cpus[0]] = EVENTLOG_TIMESTAMP_NONE;
	+
	+ n = --*heap_size;
	+ if (n == 0)
	+ return;
	+
	+ /* Re-home the former last entry, starting at the root. */
	+ replaced = heap_cpus[n];
	+ replaced_ts = timestamps[replaced];
	+ i = 0;
	+
	+ /* Heapify down */
	+ for (;;) {
	+ child = 2 * i + 1;
	+ if (child >= n)
	+ break;
	+ /* Pick the smaller child; the left one wins ties. */
	+ if (child + 1 < n &&
	+ timestamps[heap_cpus[child + 1]] <
	+ timestamps[heap_cpus[child]])
	+ child++;
	+ /* Stop once no child is strictly smaller than the entry. */
	+ if (replaced_ts <= timestamps[heap_cpus[child]])
	+ break;
	+ heap_cpus[i] = heap_cpus[child];
	+ i = child;
	+ }
	+ heap_cpus[i] = replaced;
	+}
	+
	+/*
	+ * Update the root's timestamp (root key increased) and restore heap property.
	+ * Replaces extract_min + heap_insert when we only need to update the root CPU.
	+ * Caller must ensure heap_size > 0.
	+ *
	+ * The root CPU is sifted down with a hole. At every level the children are
	+ * compared against new_timestamp, the key of the entry being moved, so the
	+ * walk does not stop early when the hole still holds a moved-up child.
	+ */
	+static inline void
	+eventlog_heap_update_root(struct eventlog_subscriber *subscriber,
	+ uint64_t new_timestamp)
	+{
	+ uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
	+ uint16_t *heap_cpus = subscriber->u.device.heap_cpus;
	+ uint16_t *heap_size = &subscriber->u.device.heap_size;
	+ uint16_t root_cpu;
	+ size_t i, child, n;
	+
	+ MPASS(*heap_size > 0);
	+
	+ n = *heap_size;
	+ root_cpu = heap_cpus[0];
	+ timestamps[root_cpu] = new_timestamp;
	+ i = 0;
	+
	+ /* Sift down from root */
	+ for (;;) {
	+ child = 2 * i + 1;
	+ if (child >= n)
	+ break;
	+ /* Pick the smaller child; the left one wins ties. */
	+ if (child + 1 < n &&
	+ timestamps[heap_cpus[child + 1]] <
	+ timestamps[heap_cpus[child]])
	+ child++;
	+ /* Stop once no child is strictly smaller than the new key. */
	+ if (new_timestamp <= timestamps[heap_cpus[child]])
	+ break;
	+ heap_cpus[i] = heap_cpus[child];
	+ i = child;
	+ }
	+ heap_cpus[i] = root_cpu;
	+}
	+
	+/*
	+ * Report the second-smallest timestamp in the heap (used as the
	+ * max_timestamp bound). In a binary min-heap that is the smaller of the
	+ * root's children. Returns UINT64_MAX when the heap holds fewer than two
	+ * entries.
	+ */
	+static inline uint64_t
	+eventlog_heap_second_min_timestamp(struct eventlog_subscriber *subscriber)
	+{
	+ uint64_t *keys = subscriber->u.device.cpu_timestamps;
	+ uint16_t *heap = subscriber->u.device.heap_cpus;
	+ uint16_t count = subscriber->u.device.heap_size;
	+ uint64_t left, right;
	+
	+ if (count < 2)
	+ return (UINT64_MAX);
	+
	+ /* With exactly two entries the only child is the answer. */
	+ left = keys[heap[1]];
	+ if (count == 2)
	+ return (left);
	+
	+ right = keys[heap[2]];
	+ return (left < right ? left : right);
	+}
	+
	+/* Global eventlog state structure */
	+struct eventlog_state {
	+ /* Provider registry */
	+ LIST_HEAD(, eventlog_provider) providers;
	+ LIST_HEAD(, eventlog_provider_stats) provider_stats;
	+ struct mtx providers_lock; /* Protects providers/stats lists */
	+ uint16_t next_provider_id; /* Next ID to assign (1-based) */
	+
	+ /* System-wide device */
	+ struct cdev *device;
	+ smr_t smr; /* SMR domain for subscriber iter. */
	+ struct mtx subscribers_mtx; /* Writer-writer add/remove excl. */
	+ struct eventlog_subscriber_head subscribers;
	+
	+ /* UMA zones */
	+ uma_zone_t session_zone;
	+
	+ /*
	+ * Dump state. dump_tq is single-threaded so dump callbacks
	+ * serialize naturally. While the TQ thread runs a callback it
	+ * publishes (dump_thread, dump_target) so eventlog_event_write_impl
	+ * can route the callback's events to just the requesting subscriber.
	+ * No lock is held: only the TQ thread reads its own publication
	+ * (curthread == dump_thread); the destroy barrier is
	+ * taskqueue_drain_all() in eventlog_provider_destroy().
	+ */
	+ struct thread *dump_thread; /* Thread running dump callback */
	+ /* Subscriber receiving dump events. */
	+ struct eventlog_subscriber *dump_target;
	+ struct taskqueue *dump_tq;
	+};
	+
	+/*
	+ * Single instance of global eventlog state.
	+ * Only the list heads and the device pointer are set statically; fields
	+ * not named here start out zero, as for any static object.
	+ */
	+static struct eventlog_state evl = {
	+ .providers = LIST_HEAD_INITIALIZER(evl.providers),
	+ .provider_stats = LIST_HEAD_INITIALIZER(evl.provider_stats),
	+ .device = NULL,
	+ .subscribers = CK_LIST_HEAD_INITIALIZER(evl.subscribers),
	+};
	+
	+/*
	+ * Initialize mutexes and SMR.
	+ * Sets up evl.providers_lock and evl.subscribers_mtx as MTX_DEF mutexes
	+ * and creates the "eventlog" SMR domain. Registered as a SYSINIT at
	+ * SI_SUB_LOCK; the argument is unused.
	+ */
	+static void
	+eventlog_state_init(void *unused)
	+{
	+ mtx_init(&evl.providers_lock, "eventlog providers", NULL, MTX_DEF);
	+ evl.smr = smr_create("eventlog", 0, 0);
	+ mtx_init(&evl.subscribers_mtx, "eventlog subscribers", NULL, MTX_DEF);
	+}
	+SYSINIT(eventlog_state_init, SI_SUB_LOCK, SI_ORDER_ANY,
	+ eventlog_state_init, NULL);
	+
	+/*
	+ * Start the single-threaded dump taskqueue. Serializing dump callbacks
	+ * lets the (dump_thread, dump_target) publication stay lock-free.
	+ *
	+ * Creates evl.dump_tq and starts exactly one thread for it at priority
	+ * PWAIT. Panics if the thread cannot be started. Registered as a SYSINIT
	+ * at SI_SUB_TASKQ; the argument is unused.
	+ */
	+static void
	+eventlog_dump_tq_init(void *unused)
	+{
	+ int err;
	+
	+ evl.dump_tq = taskqueue_create("eventlog_dump", M_WAITOK,
	+ taskqueue_thread_enqueue, &evl.dump_tq);
	+ /* One thread only: this is what serializes the dump callbacks. */
	+ err = taskqueue_start_threads(&evl.dump_tq, 1, PWAIT,
	+ "eventlog_dump taskq");
	+ if (err != 0)
	+ panic("eventlog: taskqueue_start_threads failed: %d", err);
	+}
	+SYSINIT(eventlog_dump_tq_init, SI_SUB_TASKQ, SI_ORDER_SECOND,
	+ eventlog_dump_tq_init, NULL);
	+
	+/*
	+ * Initialize UMA zone for sessions.
	+ * Items are sizeof(struct eventlog_session), pointer-aligned, with no
	+ * ctor/dtor/init/fini hooks. Registered as a SYSINIT at SI_SUB_KMEM; the
	+ * argument is unused.
	+ */
	+static void
	+eventlog_session_zone_init(void *unused)
	+{
	+ evl.session_zone = uma_zcreate("eventlog_session",
	+ sizeof(struct eventlog_session), NULL, NULL, NULL, NULL,
	+ UMA_ALIGN_PTR, 0);
	+}
	+SYSINIT(eventlog_session_zone, SI_SUB_KMEM, SI_ORDER_ANY,
	+ eventlog_session_zone_init, NULL);
	+
	+/*
	+ * Forward declarations for static helpers that are used before their
	+ * definitions. Structures are passed by pointer throughout.
	+ */
	+static void eventlog_session_update_effective(struct eventlog_session *session,
	+ struct eventlog_provider *provider);
	+static void eventlog_update_provider_enablement(
	+ struct eventlog_provider *provider);
	+static void eventlog_subscriber_write_event(
	+ struct eventlog_subscriber *subscriber,
	+ struct eventlog_session *session, struct eventlog_event_header *hdr,
	+ const struct iovec *iov, int iovcnt, size_t payload_size,
	+ uint16_t event_length, enum eventlog_level level, uint32_t keywords);
	+static void eventlog_copy_events_from_cpu(
	+ struct eventlog_subscriber *subscriber,
	+ struct eventlog_percpu_buffer *pcpu_buf, struct uio *uio,
	+ uint64_t max_timestamp, uint64_t *next_timestamp_out,
	+ bool *uio_out_of_space_out);
	+static void eventlog_read_merged(struct eventlog_subscriber *subscriber,
	+ struct uio *uio, uint64_t read_timestamp);
	+static void eventlog_resweep_idle_cpus(struct eventlog_subscriber *subscriber,
	+ uint64_t read_timestamp);
	+
	+/* Kernel sysctl node definitions */
	+SYSCTL_DECL(_kern_eventlog);
	+SYSCTL_NODE(_kern, OID_AUTO, eventlog, CTLFLAG_RD \| CTLFLAG_MPSAFE, NULL,
	+ "Event log subsystem");
	+
	+/*
	+ * Look up the shared statistics record for a provider name and take a
	+ * reference on it.
	+ * Caller must hold evl.providers_lock.
	+ * Returns the record with its refcount incremented, or NULL if no record
	+ * exists for this name.
	+ */
	+static struct eventlog_provider_stats *
	+eventlog_provider_stats_find(const char *name)
	+{
	+ struct eventlog_provider_stats *entry;
	+
	+ LIST_FOREACH(entry, &evl.provider_stats, link) {
	+ if (strcmp(entry->name, name) != 0)
	+ continue;
	+ /* Match: the caller now owns one reference. */
	+ entry->refcount++;
	+ return (entry);
	+ }
	+ return (NULL);
	+}
	+
	+/*
	+ * Force every session of one provider instance into the enabled
	+ * (enabled != 0) or disabled (enabled == 0) state. Sessions already in the
	+ * requested state are left alone; each one that flips adjusts the
	+ * sessions_enabled counter and has its effective filter recomputed.
	+ * provider->sessions_lock is held across the whole walk.
	+ */
	+static void
	+eventlog_provider_set_all_sessions(struct eventlog_provider *provider,
	+ int enabled)
	+{
	+ struct eventlog_session *s;
	+ const int want_disabled = (enabled == 0) ? 1 : 0;
	+ const int delta = (enabled != 0) ? 1 : -1;
	+
	+ mtx_lock(&provider->sessions_lock);
	+ LIST_FOREACH(s, &provider->sessions, link) {
	+ if (s->disabled != want_disabled) {
	+ counter_u64_add(provider->stats->sessions_enabled,
	+ delta);
	+ s->disabled = want_disabled;
	+ eventlog_session_update_effective(s, provider);
	+ }
	+ }
	+ mtx_unlock(&provider->sessions_lock);
	+}
	+
	+/*
	+ * Sysctl handler for kern.eventlog.<name>.default.
	+ * Values: 0=disabled, 1=enabled, -1=disable all active (set 0),
	+ * 2=enable all disabled (set 1).
	+ *
	+ * arg1 is the shared struct eventlog_provider_stats for the name. A read
	+ * reports stats->default_enabled. A write stores the new default (0 or 1)
	+ * and then notifies every provider that shares these stats: a provider
	+ * with a default_changed callback receives the raw written value; for
	+ * the others, -1 and 2 flip all of their sessions. Any other value is
	+ * rejected with EINVAL.
	+ */
	+static int
	+sysctl_eventlog_default(SYSCTL_HANDLER_ARGS)
	+{
	+ struct eventlog_provider_stats *stats = arg1;
	+ struct eventlog_provider *provider;
	+ /* Snapshot of matching providers, capped at 16 entries. */
	+ struct eventlog_provider *matched[16];
	+ int nmatched, i, error, val, new_default;
	+
	+ val = stats->default_enabled;
	+ error = sysctl_handle_int(oidp, &val, 0, req);
	+ /* Done on error, or when this was a read (no new value). */
	+ if (error != 0 \|\| req->newptr == NULL)
	+ return (error);
	+
	+ /* Map the written value onto the stored 0/1 default. */
	+ switch (val) {
	+ case -1:
	+ new_default = 0;
	+ break;
	+ case 0:
	+ case 1:
	+ new_default = val;
	+ break;
	+ case 2:
	+ new_default = 1;
	+ break;
	+ default:
	+ return (EINVAL);
	+ }
	+
	+ stats->default_enabled = new_default;
	+
	+ /*
	+ * Collect the providers sharing these stats under providers_lock,
	+ * then act on them after dropping it.
	+ * NOTE(review): providers beyond the 16th match are silently
	+ * skipped, and nothing visible here keeps a collected provider
	+ * alive once the lock is released -- confirm that is acceptable.
	+ */
	+ nmatched = 0;
	+ mtx_lock(&evl.providers_lock);
	+ LIST_FOREACH(provider, &evl.providers, link) {
	+ if (provider->stats == stats && nmatched < 16)
	+ matched[nmatched++] = provider;
	+ }
	+ mtx_unlock(&evl.providers_lock);
	+
	+ for (i = 0; i < nmatched; i++) {
	+ if (matched[i]->default_changed != NULL) {
	+ /* The callback sees the raw value, including -1 and 2. */
	+ matched[i]->default_changed(matched[i], val,
	+ matched[i]->default_changed_arg);
	+ } else if (val == -1 \|\| val == 2) {
	+ eventlog_provider_set_all_sessions(matched[i],
	+ (val == 2) ? 1 : 0);
	+ }
	+ }
	+
	+ return (0);
	+}
	+
	+/*
	+ * Allocate a new shared statistics structure. Does not insert into the
	+ * global list — caller must do that under evl.providers_lock after
	+ * re-checking for a concurrent creation. All sleeping allocations
	+ * (malloc, counter_u64_alloc, sysctl) happen here, outside any lock.
	+ *
	+ * name: provider name; copied into the record and used as the name of
	+ * the sysctl node created under kern.eventlog.
	+ * default_enabled: initial default; a kern.eventlog.<name>.default
	+ * tunable, when set, overrides it.
	+ * Returns the new record with refcount 1.
	+ */
	+static struct eventlog_provider_stats *
	+eventlog_provider_stats_alloc(const char *name, int default_enabled)
	+{
	+ struct eventlog_provider_stats *stats;
	+ struct sysctl_oid *stats_node;
	+ /*
	+ * NOTE(review): snprintf() truncates if the full tunable name does
	+ * not fit in 64 bytes; confirm EVENTLOG_PROVIDER_NAME_MAX keeps it
	+ * within bounds.
	+ */
	+ char tunable_name[64];
	+
	+ stats = malloc(sizeof(*stats), M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ strlcpy(stats->name, name, EVENTLOG_PROVIDER_NAME_MAX);
	+ stats->refcount = 1;
	+ stats->default_enabled = default_enabled;
	+ /*
	+ * Apply the kern.eventlog.<name>.default tunable on top of the
	+ * config default. TUNABLE_INT_FETCH leaves the field alone if the
	+ * tunable is absent.
	+ */
	+ snprintf(tunable_name, sizeof(tunable_name),
	+ "kern.eventlog.%s.default", name);
	+ TUNABLE_INT_FETCH(tunable_name, &stats->default_enabled);
	+ stats->sessions_created = counter_u64_alloc(M_WAITOK);
	+ stats->sessions_active = counter_u64_alloc(M_WAITOK);
	+ stats->sessions_enabled = counter_u64_alloc(M_WAITOK);
	+
	+ /* Everything below is registered in the record's own sysctl context. */
	+ sysctl_ctx_init(&stats->sysctl_ctx);
	+ stats_node = SYSCTL_ADD_NODE(&stats->sysctl_ctx,
	+ SYSCTL_STATIC_CHILDREN(_kern_eventlog), OID_AUTO, name,
	+ CTLFLAG_RD \| CTLFLAG_MPSAFE, 0,
	+ "Event log provider statistics");
	+ stats->sysctl_node = stats_node;
	+ SYSCTL_ADD_COUNTER_U64(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
	+ OID_AUTO, "sessions_created", CTLFLAG_RD, &stats->sessions_created,
	+ "Total sessions ever created successfully");
	+ SYSCTL_ADD_COUNTER_U64(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
	+ OID_AUTO, "sessions_active", CTLFLAG_RD, &stats->sessions_active,
	+ "Current active session count");
	+ SYSCTL_ADD_COUNTER_U64(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
	+ OID_AUTO, "sessions_enabled", CTLFLAG_RD, &stats->sessions_enabled,
	+ "Active sessions that are not disabled");
	+ /* Writable knob; the handler receives this record as arg1. */
	+ SYSCTL_ADD_PROC(&stats->sysctl_ctx, SYSCTL_CHILDREN(stats_node),
	+ OID_AUTO, "default", CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_MPSAFE,
	+ stats, 0, sysctl_eventlog_default, "I",
	+ "Default enabled: 0=disabled, 1=enabled, -1=disable all active, 2=enable all disabled");
	+
	+ return (stats);
	+}
	+
	+/*
	+ * Free a provider_stats that is not on the global list: either it was
	+ * never inserted (a concurrent creator won the race) or
	+ * eventlog_provider_stats_release() has already unlinked it.
	+ * The sysctl context is torn down first, before the counters it exports
	+ * are freed.
	+ */
	+static void
	+eventlog_provider_stats_free(struct eventlog_provider_stats *stats)
	+{
	+ sysctl_ctx_free(&stats->sysctl_ctx);
	+ counter_u64_free(stats->sessions_created);
	+ counter_u64_free(stats->sessions_active);
	+ counter_u64_free(stats->sessions_enabled);
	+ free(stats, M_EVENTLOG);
	+}
	+
	+/*
	+ * Drop one reference on shared provider statistics.
	+ * When the last reference goes away the record is unlinked from the
	+ * global list but NOT freed: the caller frees it after dropping the lock,
	+ * via eventlog_provider_stats_free().
	+ * Caller must hold evl.providers_lock.
	+ * Returns the stats pointer if the caller should free it, NULL otherwise.
	+ */
	+static struct eventlog_provider_stats *
	+eventlog_provider_stats_release(struct eventlog_provider_stats *stats)
	+{
	+ stats->refcount--;
	+ if (stats->refcount <= 0) {
	+ /* Last reference: unlink and hand the record back. */
	+ LIST_REMOVE(stats, link);
	+ return (stats);
	+ }
	+ return (NULL);
	+}
	+
	+/*
	+ * Create and register a new eventlog provider.
	+ *
	+ * name: provider name; must be non-NULL and shorter than
	+ * EVENTLOG_PROVIDER_NAME_MAX (asserted).
	+ * config: optional callbacks and default_enabled; NULL selects an
	+ * all-zero configuration.
	+ *
	+ * Providers created with the same name share one statistics record (and
	+ * its sysctl subtree). The provider is given the next provider_id and is
	+ * linked onto evl.providers before returning. Allocations use M_WAITOK.
	+ */
	+struct eventlog_provider*
	+eventlog_provider_create(const char *name,
	+ const struct eventlog_provider_config *config)
	+{
	+ static const struct eventlog_provider_config empty_config;
	+ struct eventlog_provider *provider;
	+ struct eventlog_provider_stats *new_stats = NULL;
	+
	+ MPASS(name != NULL);
	+ MPASS(strlen(name) < EVENTLOG_PROVIDER_NAME_MAX);
	+
	+ if (config == NULL)
	+ config = &empty_config;
	+
	+ /* Allocate provider structure */
	+ provider = malloc(sizeof(*provider), M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ strlcpy(provider->name, name, EVENTLOG_PROVIDER_NAME_MAX);
	+ provider->name_len = strlen(provider->name);
	+ provider->dump_callback = config->dump_callback;
	+ provider->dump_callback_arg = config->dump_callback_arg;
	+ provider->default_changed = config->default_changed;
	+ provider->default_changed_arg = config->default_changed_arg;
	+ provider->subscribers_changed = config->subscribers_changed;
	+ provider->subscribers_changed_arg = config->subscribers_changed_arg;
	+ mtx_init(&provider->sessions_lock, "eventlog sessions", NULL, MTX_DEF);
	+ LIST_INIT(&provider->sessions);
	+
	+ /*
	+ * Fast path: check if stats already exist for this name.
	+ * A successful find has already taken a reference for us.
	+ */
	+ mtx_lock(&evl.providers_lock);
	+ provider->stats = eventlog_provider_stats_find(name);
	+ if (provider->stats != NULL)
	+ goto insert;
	+ mtx_unlock(&evl.providers_lock);
	+
	+ /*
	+ * Slow path: allocate stats outside the lock, then re-check.
	+ * The first provider for a given name seeds default_enabled;
	+ * later providers reuse the existing stats record (the sysctl
	+ * surface is shared by name).
	+ */
	+ new_stats = eventlog_provider_stats_alloc(name,
	+ config->default_enabled);
	+
	+ mtx_lock(&evl.providers_lock);
	+ provider->stats = eventlog_provider_stats_find(name);
	+ if (provider->stats != NULL) {
	+ /*
	+ * Another thread created it while we were allocating.
	+ * Drop the lock to free our unused copy; the reference
	+ * taken by the find above is still held meanwhile.
	+ */
	+ mtx_unlock(&evl.providers_lock);
	+ eventlog_provider_stats_free(new_stats);
	+ mtx_lock(&evl.providers_lock);
	+ } else {
	+ LIST_INSERT_HEAD(&evl.provider_stats, new_stats, link);
	+ provider->stats = new_stats;
	+ }
	+
	+insert:
	+ /* Every path reaches this label with evl.providers_lock held. */
	+ /* Assign unique provider_id (1-based; 0 reserved for invalid) */
	+ /*
	+ * NOTE(review): next_provider_id is 16 bits wide; after it wraps,
	+ * the reset below hands out 1 again, which may duplicate the id of
	+ * a provider that is still registered.
	+ */
	+ if (evl.next_provider_id == 0)
	+ evl.next_provider_id = 1;
	+ provider->provider_id = evl.next_provider_id++;
	+ LIST_INSERT_HEAD(&evl.providers, provider, link);
	+ mtx_unlock(&evl.providers_lock);
	+
	+ return (provider);
	+}
	+
	+/*
	+ * Unregister and cleanup an eventlog provider.
	+ *
	+ * A NULL provider is a no-op. The provider must have no sessions left
	+ * (asserted). It is unlinked from evl.providers and its reference on the
	+ * shared stats is dropped; the dump taskqueue is then drained before any
	+ * memory is freed.
	+ */
	+void
	+eventlog_provider_destroy(struct eventlog_provider *provider)
	+{
	+ struct eventlog_provider_stats *dead_stats;
	+
	+ if (provider == NULL)
	+ return;
	+
	+ MPASS(LIST_EMPTY(&provider->sessions));
	+
	+ /*
	+ * Remove from the provider list first so no new subscription
	+ * (and therefore no new dump task) can find us.
	+ */
	+ mtx_lock(&evl.providers_lock);
	+ LIST_REMOVE(provider, link);
	+ /* Non-NULL only when this was the last reference to the stats. */
	+ dead_stats = eventlog_provider_stats_release(provider->stats);
	+ mtx_unlock(&evl.providers_lock);
	+
	+ /*
	+ * Drain the dump taskqueue: queued or in-flight tasks may still
	+ * reference this provider.
	+ */
	+ taskqueue_drain_all(evl.dump_tq);
	+
	+ /* Freed outside providers_lock. */
	+ if (dead_stats != NULL)
	+ eventlog_provider_stats_free(dead_stats);
	+
	+ mtx_destroy(&provider->sessions_lock);
	+ free(provider, M_EVENTLOG);
	+}
	+
	+/*
	+ * Create a new eventlog session.
	+ * Initial enabled state is derived from the provider's default_enabled.
	+ *
	+ * provider: owning provider; NULL yields NULL.
	+ * session_id: caller-chosen identifier stored in the session.
	+ * waitok: true allocates with M_WAITOK, false with M_NOWAIT.
	+ * create_payload, create_payload_size: payload of the SESSION_CREATE
	+ * event.
	+ * Returns the new session, or NULL if provider is NULL or the allocation
	+ * fails.
	+ */
	+struct eventlog_session*
	+eventlog_session_create(struct eventlog_provider *provider,
	+ uint64_t session_id, bool waitok,
	+ void *create_payload, size_t create_payload_size)
	+{
	+ struct bintime bt;
	+ struct eventlog_session *session;
	+ bool enabled;
	+
	+ if (provider == NULL)
	+ return (NULL);
	+
	+ session = uma_zalloc(evl.session_zone,
	+ (waitok ? M_WAITOK : M_NOWAIT) \| M_ZERO);
	+ if (session == NULL)
	+ return (NULL);
	+
	+ /* Sample the shared default once; it seeds the state below. */
	+ enabled = (provider->stats->default_enabled != 0);
	+
	+ /* Creation time from the uptime clock, converted by bintime2us(). */
	+ binuptime(&bt);
	+ session->created_at = bintime2us(&bt);
	+ session->provider = provider;
	+ session->session_id = session_id;
	+ session->disabled = enabled ? 0 : 1;
	+
	+ counter_u64_add(provider->stats->sessions_created, 1);
	+ counter_u64_add(provider->stats->sessions_active, 1);
	+ if (enabled)
	+ counter_u64_add(provider->stats->sessions_enabled, 1);
	+
	+ /* Add session to provider's list */
	+ mtx_lock(&provider->sessions_lock);
	+ LIST_INSERT_HEAD(&provider->sessions, session, link);
	+ eventlog_session_update_effective(session, provider);
	+ mtx_unlock(&provider->sessions_lock);
	+
	+ /*
	+ * Emit SESSION_CREATE only when enabled and the provider level is
	+ * not EVENTLOG_LEVEL_NONE. The event carries created_at as its
	+ * timestamp.
	+ */
	+ if (enabled && provider->level != EVENTLOG_LEVEL_NONE) {
	+ eventlog_event_write_at(session, EVENTLOG_SESSION_CREATE_ID,
	+ EVENTLOG_LEVEL_INFO, EVENTLOG_KEYWORD_SESSION,
	+ create_payload, create_payload_size,
	+ session->created_at);
	+ }
	+
	+ return (session);
	+}
	+
	+/*
	+ * Destroy an eventlog session.
	+ *
	+ * A NULL session is a no-op. A session that is still enabled emits a
	+ * SESSION_END event and gives back its sessions_enabled count. The
	+ * session is then unlinked from its provider and freed to the session
	+ * zone after an SMR synchronize.
	+ */
	+void
	+eventlog_session_destroy(struct eventlog_session *session)
	+{
	+ struct eventlog_provider *provider;
	+
	+ if (session == NULL)
	+ return;
	+
	+ provider = session->provider;
	+ MPASS(provider != NULL);
	+
	+ if (session->disabled == 0) {
	+ counter_u64_add(provider->stats->sessions_enabled, -1);
	+ /* The end event has no payload. */
	+ eventlog_event_write(session, EVENTLOG_SESSION_END_ID,
	+ EVENTLOG_LEVEL_INFO, EVENTLOG_KEYWORD_SESSION, NULL, 0);
	+ }
	+
	+ counter_u64_add(provider->stats->sessions_active, -1);
	+
	+ /* Remove session from provider's list */
	+ mtx_lock(&provider->sessions_lock);
	+ LIST_REMOVE(session, link);
	+ mtx_unlock(&provider->sessions_lock);
	+
	+ /* Wait for SMR readers before freeing */
	+ smr_synchronize(evl.smr);
	+ uma_zfree(evl.session_zone, session);
	+}
	+
	+/*
	+ * Query provider level and keywords.
	+ *
	+ * Returns provider->level as currently stored; no lock is taken.
	+ * provider must not be NULL (asserted).
	+ */
	+enum eventlog_level
	+eventlog_provider_get_level(struct eventlog_provider *provider)
	+{
	+ MPASS(provider != NULL);
	+ return (provider->level);
	+}
	+
	+/*
	+ * Returns provider->keywords as currently stored; no lock is taken.
	+ * provider must not be NULL (asserted).
	+ */
	+uint32_t
	+eventlog_provider_get_keywords(struct eventlog_provider *provider)
	+{
	+ MPASS(provider != NULL);
	+ return (provider->keywords);
	+}
	+
	+/*
	+ * Report the default_enabled value of the provider's shared stats.
	+ * A NULL provider reports 0.
	+ */
	+int
	+eventlog_provider_get_default(struct eventlog_provider *provider)
	+{
	+ return (provider != NULL ? provider->stats->default_enabled : 0);
	+}
	+
	+/*
	+ * Overwrite default_enabled in the provider's shared stats with value.
	+ * The value is stored as given: it is not range-checked here, no lock is
	+ * taken, and existing sessions are not touched. provider must not be
	+ * NULL (asserted).
	+ */
	+void
	+eventlog_provider_set_default(struct eventlog_provider *provider, int value)
	+{
	+
	+ MPASS(provider != NULL);
	+ provider->stats->default_enabled = value;
	+}
	+
	+/*
	+ * Return the auto-generated kern.eventlog.<name> sysctl node and its
	+ * context list. Children attached by providers are freed with the
	+ * node, so they must not outlive the provider.
	+ *
	+ * The node belongs to the provider's shared stats record. provider must
	+ * not be NULL (asserted).
	+ */
	+struct sysctl_oid *
	+eventlog_provider_get_sysctl_node(struct eventlog_provider *provider)
	+{
	+ MPASS(provider != NULL);
	+ return (provider->stats->sysctl_node);
	+}
	+
	+/*
	+ * Return the sysctl context list embedded in the provider's shared stats
	+ * record. provider must not be NULL (asserted).
	+ */
	+struct sysctl_ctx_list *
	+eventlog_provider_get_sysctl_ctx(struct eventlog_provider *provider)
	+{
	+ MPASS(provider != NULL);
	+ return (&provider->stats->sysctl_ctx);
	+}
	+
	+/*
	+ * Recompute a session's effective_level and effective_keywords.
	+ * Precedence: a disabled session filters everything out; otherwise a
	+ * per-session override wins; otherwise the provider's settings apply.
	+ * Caller must hold provider->sessions_lock.
	+ */
	+static void
	+eventlog_session_update_effective(struct eventlog_session *session,
	+ struct eventlog_provider *provider)
	+{
	+ if (session->disabled) {
	+ session->effective_level = EVENTLOG_LEVEL_NONE;
	+ session->effective_keywords = 0;
	+ return;
	+ }
	+
	+ if (session->has_override) {
	+ session->effective_level = session->override_level;
	+ session->effective_keywords = session->override_keywords;
	+ return;
	+ }
	+
	+ /* No override: inherit from the provider. */
	+ session->effective_level = provider->level;
	+ session->effective_keywords = provider->keywords;
	+}
	+
	+/*
	+ * Enable or disable a session.
	+ *
	+ * session: session to change; NULL is a no-op.
	+ * enabled: non-zero enables, zero disables.
	+ *
	+ * The state test, the sessions_enabled counter adjustment and the
	+ * disabled flag update are all done under provider->sessions_lock, as in
	+ * eventlog_provider_set_all_sessions(). A concurrent bulk enable/disable
	+ * therefore cannot see a half-updated session or count one transition
	+ * twice.
	+ */
	+void
	+eventlog_session_set_enabled(struct eventlog_session *session, int enabled)
	+{
	+ struct eventlog_provider *provider;
	+ int want_disabled;
	+
	+ if (session == NULL)
	+ return;
	+
	+ provider = session->provider;
	+ MPASS(provider != NULL);
	+
	+ want_disabled = (enabled == 0) ? 1 : 0;
	+
	+ mtx_lock(&provider->sessions_lock);
	+ /* No change - nothing to do */
	+ if (session->disabled != want_disabled) {
	+ counter_u64_add(provider->stats->sessions_enabled,
	+ (enabled != 0) ? 1 : -1);
	+ session->disabled = want_disabled;
	+ eventlog_session_update_effective(session, provider);
	+ }
	+ mtx_unlock(&provider->sessions_lock);
	+}
	+
	+/*
	+ * Report whether a session is enabled: 1 when session is non-NULL and
	+ * not disabled, 0 otherwise.
	+ */
	+int
	+eventlog_session_is_enabled(struct eventlog_session *session)
	+{
	+ if (session == NULL)
	+ return (0);
	+ return (session->disabled == 0 ? 1 : 0);
	+}
	+
	+/*
	+ * Set per-session level/keywords override.
	+ *
	+ * session: session to change; NULL is a no-op.
	+ * level, keywords: override values. Passing EVENTLOG_LEVEL_NONE together
	+ * with keywords == 0 clears the override, so the session follows the
	+ * provider again.
	+ *
	+ * The override fields are written under provider->sessions_lock, the
	+ * same lock under which eventlog_session_update_effective() reads them.
	+ * A concurrent recomputation therefore never observes a partially
	+ * updated override.
	+ */
	+void
	+eventlog_session_set_filter(struct eventlog_session *session,
	+ enum eventlog_level level, uint32_t keywords)
	+{
	+ struct eventlog_provider *provider;
	+
	+ if (session == NULL)
	+ return;
	+
	+ provider = session->provider;
	+ MPASS(provider != NULL);
	+
	+ mtx_lock(&provider->sessions_lock);
	+ session->has_override =
	+ (level != EVENTLOG_LEVEL_NONE \|\| keywords != 0) ? 1 : 0;
	+ session->override_level = level;
	+ session->override_keywords = keywords;
	+ eventlog_session_update_effective(session, provider);
	+ mtx_unlock(&provider->sessions_lock);
	+}
	+
	+/*
	+ * Write an event directly to all relevant subscribers (internal, with
	+ * explicit timestamp). The payload is a scatter/gather iovec; scalar
	+ * callers pass a 1-element iov. payload_size must equal the sum of
	+ * iov[*].iov_len; the caller is responsible for computing it so the
	+ * hot path doesn't need to walk the iov twice.
	+ *
	+ * Events whose header plus payload would not fit in the 16-bit
	+ * event_length field are dropped silently. The header is filled in here
	+ * and the subscriber list is walked inside an SMR read section. When
	+ * called from the dump thread while a dump target is published, the
	+ * event goes to that subscriber only.
	+ */
	+static void
	+eventlog_event_write_impl(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords,
	+ const struct iovec *iov, int iovcnt,
	+ size_t payload_size, uint64_t timestamp_us)
	+{
	+ struct eventlog_event_header hdr;
	+ struct eventlog_provider *provider;
	+ struct eventlog_subscriber *subscriber;
	+ size_t total_size;
	+
	+ MPASS(session != NULL);
	+ if (__predict_false(session == NULL))
	+ return;
	+
	+ provider = session->provider;
	+ MPASS(provider != NULL);
	+
	+ MPASS(iovcnt >= 0);
	+ MPASS(iovcnt == 0 \|\| iov != NULL);
	+
	+ /*
	+ * Bound the payload before adding the header size, so that a huge
	+ * payload_size cannot wrap the sum and slip past the limit.
	+ */
	+ if (__predict_false(payload_size >
	+ UINT16_MAX - sizeof(struct eventlog_event_header)))
	+ return;
	+
	+ total_size = sizeof(struct eventlog_event_header) + payload_size;
	+
	+ hdr.event_length = (uint16_t)total_size;
	+ hdr.RESERVED = 0;
	+ hdr.timestamp = timestamp_us;
	+ hdr.thread_id = (curthread != NULL) ? curthread->td_tid : 0;
	+ hdr.provider_id = provider->provider_id;
	+ hdr.session_id = session->session_id;
	+ hdr.event_id = id;
	+
	+ smr_enter(evl.smr);
	+ /* The CPU id is sampled after entering the SMR section. */
	+ hdr.cpu = PCPU_GET(cpuid);
	+
	+ /*
	+ * BUGBUG: It's possible other events raced on a different thread
	+ * with a later timestamp and have already been written.
	+ */
	+
	+ if (__predict_false(evl.dump_target != NULL &&
	+ curthread == evl.dump_thread)) {
	+ /* Dump callback in progress: route to the requester only. */
	+ eventlog_subscriber_write_event(evl.dump_target, session,
	+ &hdr, iov, iovcnt, payload_size,
	+ (uint16_t)total_size, level, keywords);
	+ } else {
	+ CK_LIST_FOREACH(subscriber, &evl.subscribers, link) {
	+ eventlog_subscriber_write_event(subscriber, session,
	+ &hdr, iov, iovcnt, payload_size,
	+ (uint16_t)total_size, level, keywords);
	+ }
	+ }
	+
	+ smr_exit(evl.smr);
	+}
	+
	+/*
	+ * Write a single-buffer event stamped with the current uptime.
	+ * buffer/length describe the payload.
	+ */
	+void
	+eventlog_event_write(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords, void *buffer, size_t length)
	+{
	+ struct iovec payload;
	+ struct bintime now;
	+
	+ payload.iov_base = buffer;
	+ payload.iov_len = length;
	+
	+ binuptime(&now);
	+ eventlog_event_write_impl(session, id, level, keywords,
	+ &payload, 1, length, bintime2us(&now));
	+}
	+
	+/*
	+ * Write a single-buffer event with a caller-supplied timestamp
	+ * (timestamp_us). buffer/length describe the payload.
	+ */
	+void
	+eventlog_event_write_at(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords, void *buffer, size_t length,
	+ uint64_t timestamp_us)
	+{
	+ struct iovec payload;
	+
	+ payload.iov_base = buffer;
	+ payload.iov_len = length;
	+
	+ eventlog_event_write_impl(session, id, level, keywords,
	+ &payload, 1, length, timestamp_us);
	+}
	+
	+/*
	+ * Write an event whose payload is scattered across iovcnt iovec
	+ * segments, stamped with the current uptime. The payload size is the sum
	+ * of the segment lengths.
	+ */
	+void
	+eventlog_event_write_gather(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords,
	+ const struct iovec *iov, int iovcnt)
	+{
	+ struct bintime now;
	+ size_t total = 0;
	+ int left = iovcnt;
	+
	+ /* Sum the segment lengths; a non-positive count adds nothing. */
	+ while (left-- > 0)
	+ total += iov[left].iov_len;
	+
	+ binuptime(&now);
	+ eventlog_event_write_impl(session, id, level, keywords,
	+ iov, iovcnt, total, bintime2us(&now));
	+}
	+
	+/*
	+ * Write an event whose payload is scattered across iovcnt iovec
	+ * segments, with a caller-supplied timestamp (timestamp_us). The payload
	+ * size is the sum of the segment lengths.
	+ */
	+void
	+eventlog_event_write_gather_at(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords,
	+ const struct iovec *iov, int iovcnt, uint64_t timestamp_us)
	+{
	+ size_t total = 0;
	+ int left = iovcnt;
	+
	+ /* Sum the segment lengths; a non-positive count adds nothing. */
	+ while (left-- > 0)
	+ total += iov[left].iov_len;
	+
	+ eventlog_event_write_impl(session, id, level, keywords,
	+ iov, iovcnt, total, timestamp_us);
	+}
	+
	+/*
	+ * Create a new device-based subscriber with per-CPU buffers.
	+ * buffer_size_per_cpu: Size of buffer to allocate per CPU; must lie in
	+ * [EVENTLOG_BUFFER_SIZE_MIN, EVENTLOG_BUFFER_SIZE_MAX].
	+ * The subscriber is automatically added to the global subscribers list.
	+ * Returns NULL if buffer_size_per_cpu is out of range, subscriber pointer
	+ * on success. All allocations use M_WAITOK.
	+ *
	+ * Each of the mp_maxid + 1 CPU slots gets two buffers of
	+ * buffer_size_per_cpu bytes, and its next-event timestamp starts out as
	+ * EVENTLOG_TIMESTAMP_NONE. The merge heap starts empty.
	+ */
	+struct eventlog_subscriber *
	+eventlog_subscriber_create_device(uint32_t buffer_size_per_cpu)
	+{
	+ struct eventlog_subscriber *subscriber;
	+ struct eventlog_percpu_buffer *percpu_buffers;
	+ int cpu, maxcpu;
	+
	+ if (buffer_size_per_cpu < EVENTLOG_BUFFER_SIZE_MIN \|\|
	+ buffer_size_per_cpu > EVENTLOG_BUFFER_SIZE_MAX)
	+ return (NULL);
	+
	+ /* Allocate subscriber structure */
	+ subscriber = malloc(sizeof(*subscriber), M_EVENTLOG, M_ZERO \| M_WAITOK);
	+ MPASS(subscriber != NULL);
	+
	+ CK_SLIST_INIT(&subscriber->subscriptions);
	+ subscriber->type = EVENTLOG_SUBSCRIBER_TYPE_DEVICE;
	+ subscriber->u.device.buffer_size_per_cpu = buffer_size_per_cpu;
	+ subscriber->u.device.reader_waiting = 0;
	+ subscriber->u.device.heap_size = 0;
	+ mtx_init(&subscriber->dump_pending_mtx, "eventlog dump pending",
	+ NULL, MTX_DEF);
	+ cv_init(&subscriber->dump_pending_cv, "evl_dump");
	+
	+ /* Allocate per-CPU buffers: one element per possible CPU id. */
	+ maxcpu = mp_maxid + 1;
	+ percpu_buffers = malloc(sizeof(*percpu_buffers) * maxcpu,
	+ M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ MPASS(percpu_buffers != NULL);
	+ subscriber->u.device.percpu_buffers = percpu_buffers;
	+
	+ /* Allocate cpu_timestamps and heap for merge ordering */
	+ subscriber->u.device.cpu_timestamps = malloc(sizeof(uint64_t) * maxcpu,
	+ M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ MPASS(subscriber->u.device.cpu_timestamps != NULL);
	+ subscriber->u.device.heap_cpus = malloc(sizeof(uint16_t) * maxcpu,
	+ M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ MPASS(subscriber->u.device.heap_cpus != NULL);
	+ /* No CPU has a pending event yet. */
	+ for (cpu = 0; cpu < maxcpu; cpu++)
	+ subscriber->u.device.cpu_timestamps[cpu] =
	+ EVENTLOG_TIMESTAMP_NONE;
	+
	+ /* Allocate reader/writer buffers for each CPU */
	+ for (cpu = 0; cpu < maxcpu; cpu++) {
	+ percpu_buffers[cpu].buffer_size = buffer_size_per_cpu;
	+ percpu_buffers[cpu].packed_state = EVTLOG_SWAP_ALLOWED;
	+#ifndef EVENTLOG_HAS_ATOMIC64
	+ mtx_init(&percpu_buffers[cpu].swap_lock,
	+ "eventlog swap", NULL, MTX_SPIN);
	+#endif
	+ percpu_buffers[cpu].buffers[0] = malloc(buffer_size_per_cpu,
	+ M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ MPASS(percpu_buffers[cpu].buffers[0] != NULL);
	+ percpu_buffers[cpu].buffers[1] = malloc(buffer_size_per_cpu,
	+ M_EVENTLOG, M_WAITOK \| M_ZERO);
	+ MPASS(percpu_buffers[cpu].buffers[1] != NULL);
	+ }
	+
	+ /* Add subscriber to global list */
	+ mtx_lock(&evl.subscribers_mtx);
	+ CK_LIST_INSERT_HEAD(&evl.subscribers, subscriber, link);
	+ mtx_unlock(&evl.subscribers_mtx);
	+
	+ return (subscriber);
	+}
	+
	+/*
	+ * Create a new callback-based subscriber.
	+ *
	+ * callback:     function invoked for each delivered event; must not be
	+ *               NULL (asserted with MPASS).
	+ * callback_arg: opaque argument stored and passed back to `callback`.
	+ *
	+ * The subscriber starts with an empty subscription list and is inserted
	+ * at the head of the global evl.subscribers list, under
	+ * evl.subscribers_mtx, before this function returns.
	+ *
	+ * Returns the new subscriber.  The allocation uses M_WAITOK and the
	+ * result is asserted non-NULL.
	+ */
	+struct eventlog_subscriber *
	+eventlog_subscriber_create_callback(eventlog_callback_t callback,
	+    void *callback_arg)
	+{
	+	struct eventlog_subscriber *subscriber;
	+
	+	MPASS(callback != NULL);
	+
	+	/* M_ZERO: every field not assigned below starts out as zero. */
	+	subscriber = malloc(sizeof(*subscriber), M_EVENTLOG,
	+	    M_ZERO | M_WAITOK);
	+	MPASS(subscriber != NULL);
	+
	+	CK_SLIST_INIT(&subscriber->subscriptions);
	+	subscriber->type = EVENTLOG_SUBSCRIBER_TYPE_CALLBACK;
	+	subscriber->u.callback.callback = callback;
	+	subscriber->u.callback.callback_arg = callback_arg;
	+
	+	/* Mutex/cv pair waited on by eventlog_subscriber_drain_dumps(). */
	+	mtx_init(&subscriber->dump_pending_mtx, "eventlog dump pending",
	+	    NULL, MTX_DEF);
	+	cv_init(&subscriber->dump_pending_cv, "evl_dump");
	+
	+	/* Publish: make the subscriber visible on the global list. */
	+	mtx_lock(&evl.subscribers_mtx);
	+	CK_LIST_INSERT_HEAD(&evl.subscribers, subscriber, link);
	+	mtx_unlock(&evl.subscribers_mtx);
	+
	+	return (subscriber);
	+}
	+
	+/*
	+ * Async dump_state machinery. One eventlog_dump_task per (subscriber,
	+ * provider) pair is enqueued on evl.dump_tq; the TQ thread publishes
	+ * (dump_thread, dump_target), invokes provider->dump_callback, then
	+ * decrements subscriber->dump_pending and signals dump_pending_cv.
	+ *
	+ * Subscriber and provider pointers in the task are kept alive by their
	+ * destroy paths draining the TQ before freeing memory.
	+ *
	+ * The task is heap-allocated by the subscribe path and freed by
	+ * eventlog_dump_task_handler() once the dump has run.
	+ */
	+struct eventlog_dump_task {
	+ struct task task; /* queued on evl.dump_tq; handler context is this struct */
	+ struct eventlog_subscriber *subscriber; /* subscriber the dump is for */
	+ struct eventlog_provider *provider; /* provider whose dump_callback runs */
	+};
	+
	+/*
	+ * Forward declarations for eventlog_emit_dump_complete(); definitions
	+ * are further down with the rest of the subscriber write path.
	+ *
	+ * `hdr` and `iov` are pointers: callers pass &hdr and an iovec array,
	+ * and the definitions dereference them (hdr->event_length, iov[i]).
	+ */
	+static void eventlog_subscriber_write_event_device(
	+    struct eventlog_subscriber *subscriber,
	+    struct eventlog_provider *provider, uint64_t session_id,
	+    struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
	+    size_t payload_size);
	+static void eventlog_subscriber_write_event_callback(
	+    struct eventlog_subscriber *subscriber,
	+    struct eventlog_provider *provider, uint64_t session_id,
	+    struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
	+    size_t payload_size);
	+
	+/*
	+ * Deliver a synthetic EVENTLOG_DUMP_COMPLETE_ID event to `subscriber`
	+ * on behalf of `provider`, after the provider's dump_callback has run.
	+ *
	+ * The event is header-only (no payload) and carries
	+ * EVENTLOG_SESSION_ID_NONE as its session.  It is delivered only when
	+ * the subscriber's subscription to this provider admits
	+ * EVENTLOG_LEVEL_INFO and includes EVENTLOG_KEYWORD_SESSION, the same
	+ * level/keyword filter used for SESSION_CREATE/SESSION_END.
	+ *
	+ * The subscription walk and the delivery both run inside an SMR read
	+ * section on evl.smr.
	+ */
	+static void
	+eventlog_emit_dump_complete(struct eventlog_provider *provider,
	+    struct eventlog_subscriber *subscriber)
	+{
	+	struct iovec iov = { .iov_base = NULL, .iov_len = 0 };
	+	struct eventlog_event_header hdr;
	+	struct eventlog_subscription *sub;
	+	struct bintime now;
	+	bool deliver;
	+
	+	/* Build the header; event_length covers the header alone. */
	+	binuptime(&now);
	+	hdr.event_length = (uint16_t)sizeof(hdr);
	+	hdr.RESERVED = 0;
	+	hdr.timestamp = bintime2us(&now);
	+	if (curthread != NULL)
	+		hdr.thread_id = curthread->td_tid;
	+	else
	+		hdr.thread_id = 0;
	+	hdr.provider_id = provider->provider_id;
	+	hdr.session_id = EVENTLOG_SESSION_ID_NONE;
	+	hdr.event_id = EVENTLOG_DUMP_COMPLETE_ID;
	+
	+	smr_enter(evl.smr);
	+	/* CPU id is sampled inside the read section, as in the original. */
	+	hdr.cpu = PCPU_GET(cpuid);
	+
	+	/* Only the first subscription for this provider is consulted. */
	+	deliver = false;
	+	CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
	+		if (sub->provider != provider)
	+			continue;
	+		deliver = (sub->level >= EVENTLOG_LEVEL_INFO &&
	+		    (sub->keywords & EVENTLOG_KEYWORD_SESSION) != 0);
	+		break;
	+	}
	+
	+	if (deliver) {
	+		if (subscriber->type != EVENTLOG_SUBSCRIBER_TYPE_DEVICE) {
	+			eventlog_subscriber_write_event_callback(subscriber,
	+			    provider, EVENTLOG_SESSION_ID_NONE, &hdr, &iov,
	+			    0, 0);
	+		} else {
	+			eventlog_subscriber_write_event_device(subscriber,
	+			    provider, EVENTLOG_SESSION_ID_NONE, &hdr, &iov,
	+			    0, 0);
	+		}
	+	}
	+
	+	smr_exit(evl.smr);
	+}
	+
	+static void
	+eventlog_dump_task_handler(void *context, int pending __unused)
	+{
	+ struct eventlog_dump_task *dt = context;
	+ struct eventlog_subscriber *subscriber = dt->subscriber;
	+ struct eventlog_provider *provider = dt->provider;
	+
	+ /*
	+ * Taskqueue handler for one eventlog_dump_task, passed as `context`.
	+ * It publishes (dump_thread, dump_target), runs the provider's
	+ * dump_callback, emits the dump-complete marker event, clears the
	+ * publication, drops the subscriber's dump_pending count and finally
	+ * frees the task.
	+ *
	+ * No lock around the publication: the single-threaded TQ is the
	+ * only writer; other threads' curthread != dump_thread so they
	+ * always take the normal subscriber-fanout path regardless of
	+ * any torn read.
	+ */
	+ evl.dump_thread = curthread;
	+ evl.dump_target = subscriber;
	+ provider->dump_callback(provider, provider->dump_callback_arg);
	+ eventlog_emit_dump_complete(provider, subscriber); /* still published as dump target */
	+ evl.dump_target = NULL;
	+ evl.dump_thread = NULL;
	+
	+ mtx_lock(&subscriber->dump_pending_mtx);
	+ KASSERT(subscriber->dump_pending > 0,
	+ ("eventlog: dump_pending underflow on %p", subscriber));
	+ if (--subscriber->dump_pending == 0) /* last outstanding dump for this subscriber */
	+ cv_broadcast(&subscriber->dump_pending_cv); /* wakes eventlog_subscriber_drain_dumps() */
	+ mtx_unlock(&subscriber->dump_pending_mtx);
	+
	+ free(dt, M_EVENTLOG); /* allocated in eventlog_subscriber_add_subscription_one() */
	+}
	+
	+/*
	+ * Wait until no dump_state task for this subscriber remains queued or
	+ * running.
	+ *
	+ * Sleeps on dump_pending_cv, with dump_pending_mtx held, for as long as
	+ * dump_pending is positive; eventlog_dump_task_handler() broadcasts the
	+ * cv when it drops the count to zero.  A NULL subscriber is a no-op.
	+ */
	+void
	+eventlog_subscriber_drain_dumps(struct eventlog_subscriber *subscriber)
	+{
	+
	+	if (subscriber != NULL) {
	+		mtx_lock(&subscriber->dump_pending_mtx);
	+		for (;;) {
	+			if (!(subscriber->dump_pending > 0))
	+				break;
	+			cv_wait(&subscriber->dump_pending_cv,
	+			    &subscriber->dump_pending_mtx);
	+		}
	+		mtx_unlock(&subscriber->dump_pending_mtx);
	+	}
	+}
	+
	+/*
	+ * Destroy a subscriber and update provider enablement.
	+ *
	+ * Teardown order:
	+ *   1. drain outstanding dump tasks (they reference this subscriber);
	+ *   2. unlink the subscriber from evl.subscribers;
	+ *   3. recompute enablement for every provider it was subscribed to;
	+ *   4. smr_synchronize() so no SMR reader still sees the subscriber;
	+ *   5. free subscriptions, per-CPU buffers (device type only), the
	+ *      dump_pending cv/mutex, and the subscriber itself.
	+ *
	+ * A NULL subscriber is a no-op.
	+ */
	+void
	+eventlog_subscriber_destroy(struct eventlog_subscriber *subscriber)
	+{
	+	struct eventlog_subscription *sub, *sub_next;
	+
	+	if (subscriber == NULL)
	+		return;
	+
	+	/*
	+	 * Drain dump tasks first; they reference this subscriber's
	+	 * buffers and would UAF if we freed them mid-callback.
	+	 */
	+	eventlog_subscriber_drain_dumps(subscriber);
	+
	+	/* Remove subscriber from global list */
	+	mtx_lock(&evl.subscribers_mtx);
	+	CK_LIST_REMOVE(subscriber, link);
	+	mtx_unlock(&evl.subscribers_mtx);
	+
	+	/* Update all provider enablements (we're no longer visible) */
	+	CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
	+		eventlog_update_provider_enablement(sub->provider);
	+	}
	+
	+	/* Wait for all SMR readers before freeing */
	+	smr_synchronize(evl.smr);
	+
	+	/* Free subscriptions; _SAFE because each node is freed in turn. */
	+	CK_SLIST_FOREACH_SAFE(sub, &subscriber->subscriptions, link,
	+	    sub_next) {
	+		free(sub, M_EVENTLOG);
	+	}
	+
	+	/* Clean up subscriber based on type */
	+	if (subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE) {
	+		int cpu, maxcpu = mp_maxid + 1;
	+		struct eventlog_percpu_buffer *percpu_buffers =
	+		    subscriber->u.device.percpu_buffers;
	+
	+		/* free(9) ignores NULL, so only the array itself is guarded. */
	+		if (percpu_buffers != NULL) {
	+			for (cpu = 0; cpu < maxcpu; cpu++) {
	+				free(percpu_buffers[cpu].buffers[0],
	+				    M_EVENTLOG);
	+				free(percpu_buffers[cpu].buffers[1],
	+				    M_EVENTLOG);
	+#ifndef EVENTLOG_HAS_ATOMIC64
	+				mtx_destroy(&percpu_buffers[cpu].swap_lock);
	+#endif
	+			}
	+			free(percpu_buffers, M_EVENTLOG);
	+		}
	+		free(subscriber->u.device.cpu_timestamps, M_EVENTLOG);
	+		free(subscriber->u.device.heap_cpus, M_EVENTLOG);
	+	}
	+	/* Callback subscribers don't need cleanup */
	+
	+	cv_destroy(&subscriber->dump_pending_cv);
	+	mtx_destroy(&subscriber->dump_pending_mtx);
	+	free(subscriber, M_EVENTLOG);
	+}
	+
	+/*
	+ * Subscribe to a single provider. Handles both new subscriptions and
	+ * updating existing ones.
	+ *
	+ * subscriber: subscriber gaining (or updating) the subscription.
	+ * provider:   provider being subscribed to.
	+ * level:      level stored in the subscription.
	+ * keywords:   keyword mask stored in the subscription.
	+ * flags:      EVENTLOG_SUBSCRIPTION_* bits; only
	+ *             EVENTLOG_SUBSCRIPTION_DUMP_STATE is examined here.
	+ *
	+ * On a brand-new subscription (not an in-place update) and only when
	+ * the provider has a dump_callback, enqueue one task on evl.dump_tq
	+ * so the provider can replay current state. Re-subscribing does not
	+ * re-fire the dump.
	+ */
	+static void
	+eventlog_subscriber_add_subscription_one(struct eventlog_subscriber *subscriber,
	+    struct eventlog_provider *provider, enum eventlog_level level,
	+    uint32_t keywords, uint32_t flags)
	+{
	+	struct eventlog_subscription *sub, *new_sub;
	+	struct eventlog_dump_task *dt;
	+	bool newly_subscribed = false;
	+
	+	/*
	+	 * Allocate up front, before taking subscribers_mtx; the node is
	+	 * released again if an existing subscription is found.
	+	 */
	+	new_sub = malloc(sizeof(*new_sub), M_EVENTLOG, M_WAITOK);
	+	MPASS(new_sub != NULL);
	+	new_sub->provider = provider;
	+	new_sub->level = level;
	+	new_sub->keywords = keywords;
	+
	+	mtx_lock(&evl.subscribers_mtx);
	+
	+	CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
	+		if (sub->provider == provider) {
	+			/* Already subscribed; update in place. */
	+			sub->level = level;
	+			sub->keywords = keywords;
	+			mtx_unlock(&evl.subscribers_mtx);
	+			free(new_sub, M_EVENTLOG);
	+			goto update_enablement;
	+		}
	+	}
	+
	+	CK_SLIST_INSERT_HEAD(&subscriber->subscriptions, new_sub, link);
	+	newly_subscribed = true;
	+
	+	mtx_unlock(&evl.subscribers_mtx);
	+
	+update_enablement:
	+
	+	/* Update provider enablement */
	+	eventlog_update_provider_enablement(provider);
	+
	+	if (!newly_subscribed || provider->dump_callback == NULL ||
	+	    (flags & EVENTLOG_SUBSCRIPTION_DUMP_STATE) == 0)
	+		return;
	+
	+	/*
	+	 * First-time subscribe + dump_callback + DUMP_STATE flag:
	+	 * enqueue an async dump. Bumping dump_pending under the
	+	 * subscriber's mtx ensures a racing destroy() either sees the
	+	 * pending count and waits, or finds none yet and our task is
	+	 * still scheduled to fire after subscribe returns.
	+	 *
	+	 * M_NOWAIT: on failure skip the dump rather than block subscribe;
	+	 * the live event stream is still delivered.
	+	 */
	+	dt = malloc(sizeof(*dt), M_EVENTLOG, M_NOWAIT);
	+	if (dt == NULL)
	+		return;
	+	TASK_INIT(&dt->task, 0, eventlog_dump_task_handler, dt);
	+	dt->subscriber = subscriber;
	+	dt->provider = provider;
	+
	+	mtx_lock(&subscriber->dump_pending_mtx);
	+	subscriber->dump_pending++;
	+	mtx_unlock(&subscriber->dump_pending_mtx);
	+
	+	/* Ownership of dt passes to the handler, which frees it. */
	+	taskqueue_enqueue(evl.dump_tq, &dt->task);
	+}
	+
	+/*
	+ * Subscribe `subscriber` to every registered provider whose name equals
	+ * provider_name.  Several providers may share one name (e.g. different
	+ * TCP stacks each registering "tcp"); each match gets its own
	+ * subscription with the given level and keywords.
	+ *
	+ * `flags` is a bitmask of EVENTLOG_SUBSCRIPTION_* values.  With
	+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE set, every newly-subscribed provider
	+ * that has a dump_callback gets an asynchronous dump enqueued;
	+ * eventlog_subscriber_drain_dumps() waits for them.
	+ *
	+ * At most EVENTLOG_MAX_PROVIDERS matches are collected; further
	+ * matches are ignored.
	+ *
	+ * Returns 0 on success, EINVAL if `flags` has bits outside
	+ * EVENTLOG_SUBSCRIPTION_FLAGS_VALID, ENOENT if no provider matches.
	+ */
	+int
	+eventlog_subscriber_add_subscription(struct eventlog_subscriber *subscriber,
	+    const char *provider_name, enum eventlog_level level, uint32_t keywords,
	+    uint32_t flags)
	+{
	+	struct eventlog_provider *matched[EVENTLOG_MAX_PROVIDERS];
	+	struct eventlog_provider *candidate;
	+	int nmatched, idx;
	+
	+	MPASS(subscriber != NULL);
	+	MPASS(provider_name != NULL);
	+
	+	if ((flags & ~EVENTLOG_SUBSCRIPTION_FLAGS_VALID) != 0)
	+		return (EINVAL);
	+
	+	/* Snapshot the matching providers while holding providers_lock. */
	+	nmatched = 0;
	+	mtx_lock(&evl.providers_lock);
	+	LIST_FOREACH(candidate, &evl.providers, link) {
	+		if (strcmp(candidate->name, provider_name) != 0)
	+			continue;
	+		if (nmatched >= EVENTLOG_MAX_PROVIDERS)
	+			continue;
	+		matched[nmatched] = candidate;
	+		nmatched++;
	+	}
	+	mtx_unlock(&evl.providers_lock);
	+
	+	/* TODO: Support subscribing before provider is registered. */
	+	if (nmatched == 0)
	+		return (ENOENT);
	+
	+	/* Subscribe to each match with the lock released. */
	+	idx = 0;
	+	while (idx < nmatched) {
	+		eventlog_subscriber_add_subscription_one(subscriber,
	+		    matched[idx], level, keywords, flags);
	+		idx++;
	+	}
	+
	+	return (0);
	+}
	+
	+/*
	+ * Update provider enablement based on all active subscribers.
	+ * Keywords are OR'ed, level is MAX (most verbose) of all subscribers.
	+ *
	+ * The global subscriber list is walked inside an SMR read section.
	+ * With no subscription left, the provider's keywords become 0 and its
	+ * level EVENTLOG_LEVEL_NONE.  Every session of the provider then has
	+ * its effective values recomputed.
	+ *
	+ * sessions_lock is held across the recount and per-session update so
	+ * the subscribers_changed callback fires exactly once per real 0<->N
	+ * edge. The callback runs after the lock is dropped.
	+ */
	+static void
	+eventlog_update_provider_enablement(struct eventlog_provider *provider)
	+{
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_subscription *sub;
	+	enum eventlog_level max_level = EVENTLOG_LEVEL_NONE;
	+	uint32_t or_keywords = 0;
	+	bool has_subscribers = false;
	+	bool transitioned = false;
	+
	+	MPASS(provider != NULL);
	+
	+	mtx_lock(&provider->sessions_lock);
	+
	+	/* Aggregate level/keywords over every subscription to us. */
	+	smr_enter(evl.smr);
	+	CK_LIST_FOREACH(subscriber, &evl.subscribers, link) {
	+		CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
	+			if (sub->provider == provider) {
	+				has_subscribers = true;
	+				or_keywords |= sub->keywords;
	+				if (sub->level > max_level)
	+					max_level = sub->level;
	+			}
	+		}
	+	}
	+	smr_exit(evl.smr);
	+
	+	/* Record a 0<->N edge; the callback is deferred until unlock. */
	+	if (provider->has_subscribers != has_subscribers) {
	+		provider->has_subscribers = has_subscribers;
	+		transitioned = true;
	+	}
	+
	+	/* Update provider enablement */
	+	if (has_subscribers) {
	+		provider->keywords = or_keywords;
	+		provider->level = max_level;
	+	} else {
	+		/* No subscribers - disable provider */
	+		provider->keywords = 0;
	+		provider->level = EVENTLOG_LEVEL_NONE;
	+	}
	+
	+	/* Update all sessions' effective values */
	+	LIST_FOREACH(session, &provider->sessions, link) {
	+		eventlog_session_update_effective(session, provider);
	+	}
	+	mtx_unlock(&provider->sessions_lock);
	+
	+	if (transitioned && provider->subscribers_changed != NULL) {
	+		provider->subscribers_changed(provider, has_subscribers,
	+		    provider->subscribers_changed_arg);
	+	}
	+}
	+
	+/*
	+ * Swap buffers for a single CPU if the reader buffer is empty and the
	+ * active buffer has data. Returns true if data is available in the
	+ * reader buffer (either from a swap we performed or a proactive writer
	+ * swap that already completed); false if there is nothing to read.
	+ *
	+ * The swap can lose its CAS to a concurrent writer commit or proactive
	+ * swap, so we loop, re-checking swap_allowed and commit_pos each time.
	+ *
	+ * `cpu` is not used by the body.
	+ */
	+static EVENTLOG_INLINING bool
	+eventlog_swap_cpu_buffer_if_needed(struct eventlog_percpu_buffer *pcpu_buf,
	+    int cpu)
	+{
	+	evtlog_state_t state;
	+
	+	state = evtlog_load_state(pcpu_buf);
	+	while (1) {
	+		/* Swap not allowed: the reader buffer still holds data. */
	+		if (!evtlog_state_swap_allowed(state)) {
	+			MPASS(evtlog_state_reader_len(state) > 0);
	+			return (true);
	+		}
	+
	+		/* Nothing committed to the active buffer: nothing to read. */
	+		if (evtlog_state_commit_pos(state) == 0)
	+			return (false);
	+
	+		if (evtlog_try_swap(pcpu_buf, &state))
	+			return (true);
	+		/* Lost the swap CAS to a peer; state is refreshed, retry. */
	+	}
	+}
	+
	+/*
	+ * Visit every CPU that is not already in the merge heap and try to make
	+ * its data readable via eventlog_swap_cpu_buffer_if_needed().
	+ *
	+ * A cpu_timestamps[] entry below EVENTLOG_TIMESTAMP_SWEPT marks a CPU
	+ * that is already in the heap from a previous call; such CPUs are left
	+ * alone (no swap, no reinsert).  Any other CPU is inserted into the
	+ * heap, keyed by the timestamp of its next readable event, when data is
	+ * available, and is otherwise marked EVENTLOG_TIMESTAMP_NONE.
	+ */
	+static void
	+eventlog_swap_buffers_if_needed(struct eventlog_subscriber *subscriber)
	+{
	+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
	+	struct eventlog_percpu_buffer *pcpu_buf;
	+	int cpu;
	+
	+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
	+
	+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
	+		/* Only CPUs outside the heap need attention. */
	+		if (timestamps[cpu] >= EVENTLOG_TIMESTAMP_SWEPT) {
	+			pcpu_buf = &subscriber->u.device.percpu_buffers[cpu];
	+			if (!eventlog_swap_cpu_buffer_if_needed(pcpu_buf,
	+			    cpu)) {
	+				timestamps[cpu] = EVENTLOG_TIMESTAMP_NONE;
	+			} else {
	+				eventlog_heap_insert(subscriber,
	+				    (uint16_t)cpu,
	+				    eventlog_peek_next_timestamp(pcpu_buf));
	+			}
	+		}
	+	}
	+}
	+
	+/*
	+ * Read events from a device subscriber's buffer.
	+ * Handles both user-space (UIO_USERSPACE) and kernel (UIO_SYSSPACE) uio.
	+ *
	+ * subscriber: device-type subscriber to read from.
	+ * uio:        destination; must describe exactly one iovec with a
	+ *             non-zero residual, otherwise EOPNOTSUPP is returned.
	+ * flags:      FNONBLOCK makes an empty read return EAGAIN immediately.
	+ *
	+ * Without FNONBLOCK an empty read sleeps for at most hz ticks waiting
	+ * for a writer wakeup, then re-checks; if there is still no data it
	+ * returns EAGAIN.  A tsleep() error other than EWOULDBLOCK (e.g. from
	+ * PCATCH) is returned as-is.
	+ *
	+ * Returns 0 once eventlog_read_merged() has run.  That helper returns
	+ * void, so this function does not report copy failures itself.
	+ */
	+int
	+eventlog_subscriber_read(struct eventlog_subscriber *subscriber,
	+    struct uio *uio, int flags)
	+{
	+	struct bintime bt;
	+	uint64_t read_timestamp;
	+	int error = 0;
	+
	+	MPASS(subscriber != NULL);
	+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
	+	MPASS(uio != NULL);
	+
	+	if (uio->uio_iovcnt != 1 || uio->uio_resid == 0)
	+		return (EOPNOTSUPP);	/* Only one iovec supported */
	+
	+	/* Swap to get latest data, then check if we have anything to read. */
	+	eventlog_swap_buffers_if_needed(subscriber);
	+
	+	if (subscriber->u.device.heap_size == 0) {
	+		if (flags & FNONBLOCK)
	+			return (EAGAIN);
	+
	+		/*
	+		 * Wait for writers to produce data.  The hz timeout
	+		 * bounds the sleep even if no writer issues a wakeup.
	+		 */
	+		atomic_store_rel_32(&subscriber->u.device.reader_waiting, 1);
	+		error = tsleep(subscriber, PCATCH, "evtlogrd", hz);
	+		atomic_store_rel_32(&subscriber->u.device.reader_waiting, 0);
	+		if (error != 0 && error != EWOULDBLOCK)
	+			return (error);
	+
	+		eventlog_swap_buffers_if_needed(subscriber);
	+		if (subscriber->u.device.heap_size == 0)
	+			return (EAGAIN);
	+	}
	+
	+	/* Events newer than this instant are deferred to the next read. */
	+	binuptime(&bt);
	+	read_timestamp = bintime2us(&bt);
	+
	+	eventlog_read_merged(subscriber, uio, read_timestamp);
	+	return (0);
	+}
	+
	+/*
	+ * Second pass over CPUs whose cpu_timestamps[] entry is
	+ * EVENTLOG_TIMESTAMP_NONE, run after the merge hits a timestamp
	+ * boundary.  It picks up events from preempted writers that committed
	+ * before read_timestamp on a CPU that had no data when it was last
	+ * examined.
	+ *
	+ * A CPU with readable data whose next event is not newer than
	+ * read_timestamp is inserted into the heap.  Every other visited CPU
	+ * is marked EVENTLOG_TIMESTAMP_SWEPT.
	+ */
	+static void
	+eventlog_resweep_idle_cpus(struct eventlog_subscriber *subscriber,
	+    uint64_t read_timestamp)
	+{
	+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
	+	struct eventlog_percpu_buffer *pcpu_buf;
	+	uint64_t ts;
	+	int cpu;
	+
	+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
	+		/* In heap or already swept */
	+		if (timestamps[cpu] != EVENTLOG_TIMESTAMP_NONE)
	+			continue;
	+
	+		pcpu_buf = &subscriber->u.device.percpu_buffers[cpu];
	+		if (!eventlog_swap_cpu_buffer_if_needed(pcpu_buf, cpu)) {
	+			timestamps[cpu] = EVENTLOG_TIMESTAMP_SWEPT;
	+			continue;
	+		}
	+
	+		ts = eventlog_peek_next_timestamp(pcpu_buf);
	+		if (ts > read_timestamp)
	+			timestamps[cpu] = EVENTLOG_TIMESTAMP_SWEPT;
	+		else
	+			eventlog_heap_insert(subscriber, (uint16_t)cpu, ts);
	+	}
	+}
	+
	+/*
	+ * Merge events from all CPUs in timestamp order, copying via uio.
	+ * Events with timestamps beyond read_timestamp are deferred to the next read.
	+ * Caller must have called eventlog_swap_buffers_if_needed beforehand.
	+ *
	+ * Each iteration takes the heap root (the CPU with the lowest next
	+ * timestamp) and copies its events up to the smaller of the second-
	+ * lowest heap timestamp and read_timestamp.  The loop ends when the
	+ * uio has no room left or the heap is empty.
	+ */
	+static EVENTLOG_INLINING void
	+eventlog_read_merged(struct eventlog_subscriber *subscriber, struct uio *uio,
	+    uint64_t read_timestamp)
	+{
	+	struct eventlog_percpu_buffer *pcpu_buf;
	+	uint64_t *timestamps = subscriber->u.device.cpu_timestamps;
	+
	+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
	+	MPASS(subscriber->u.device.heap_size > 0);
	+
	+	/* Take lowest timestamp, copy from that CPU, reinsert when drained. */
	+	while (uio->uio_resid > 0 && subscriber->u.device.heap_size > 0) {
	+		uint16_t current_cpu = subscriber->u.device.heap_cpus[0];
	+		uint64_t max_timestamp, effective_max, next_timestamp;
	+		bool uio_out_of_space;
	+
	+		pcpu_buf = &subscriber->u.device.percpu_buffers[current_cpu];
	+		max_timestamp = eventlog_heap_second_min_timestamp(subscriber);
	+		effective_max = (max_timestamp < read_timestamp) ?
	+		    max_timestamp : read_timestamp;
	+
	+		eventlog_copy_events_from_cpu(subscriber, pcpu_buf, uio,
	+		    effective_max, &next_timestamp, &uio_out_of_space);
	+
	+		/* Also set by the copy helper when copyout() fails. */
	+		if (uio_out_of_space)
	+			break;
	+
	+		EVENTLOG_VALIDATE_READER(pcpu_buf);
	+		if (evtlog_state_reader_len(evtlog_load_state(pcpu_buf)) ==
	+		    pcpu_buf->read_pos) {
	+			MPASS(next_timestamp == 0);
	+
	+			/*
	+			 * Reader buffer fully drained. Atomically clear
	+			 * reader_len and set swap_allowed in one CAS so
	+			 * the upper-32-bit and lower-32-bit updates are
	+			 * inseparable from concurrent writer commits.
	+			 */
	+			evtlog_drain_complete(pcpu_buf);
	+
	+			if (eventlog_swap_cpu_buffer_if_needed(pcpu_buf,
	+			    current_cpu)) {
	+				/*
	+				 * Single CPU swapped; update timestamp and
	+				 * possibly reinsert.
	+				 */
	+				next_timestamp =
	+				    eventlog_peek_next_timestamp(pcpu_buf);
	+				if (next_timestamp > read_timestamp) {
	+					eventlog_heap_extract_min(subscriber);
	+					eventlog_resweep_idle_cpus(subscriber,
	+					    read_timestamp);
	+					continue;
	+				}
	+				/* Still the minimum: stays at the root. */
	+				if (next_timestamp <= max_timestamp) {
	+					timestamps[current_cpu] =
	+					    next_timestamp;
	+					continue;
	+				}
	+				/* No longer min; update root and sift down. */
	+				eventlog_heap_update_root(subscriber,
	+				    next_timestamp);
	+			} else {
	+				/* Buffer drained, no swap: remove from heap. */
	+				eventlog_heap_extract_min(subscriber);
	+			}
	+			continue;
	+		}
	+
	+		if (next_timestamp > read_timestamp) {
	+			/* Remaining events are past the epoch boundary. */
	+			eventlog_heap_extract_min(subscriber);
	+			eventlog_resweep_idle_cpus(subscriber, read_timestamp);
	+			continue;
	+		}
	+
	+		/* Buffer has more data within epoch: update root and sift. */
	+		MPASS(next_timestamp != 0);
	+		eventlog_heap_update_root(subscriber, next_timestamp);
	+	}
	+}
	+
	+/*
	+ * Copy events from a CPU buffer up to a given timestamp threshold.
	+ * UIO_USERSPACE uses copyout; UIO_SYSSPACE uses bcopy directly.
	+ * Stops if we run out of space.
	+ *
	+ * subscriber:           owning subscriber (not referenced by the body).
	+ * pcpu_buf:             per-CPU buffer whose reader half is consumed.
	+ * uio:                  destination; only uio_iov[0] is written.
	+ * max_timestamp:        events with a larger timestamp are left in
	+ *                       place.
	+ * next_timestamp_out:   timestamp of the first event not copied, or 0
	+ *                       when the copy drained the reader buffer.
	+ * uio_out_of_space_out: set to true when the next event does not fit
	+ *                       in the uio, and also when copyout() fails.
	+ *
	+ * The scan first sizes a contiguous batch of whole events, then copies
	+ * it with a single copyout()/bcopy() and advances read_pos.  read_pos
	+ * is not advanced when copyout() fails.
	+ */
	+static EVENTLOG_INLINING void
	+eventlog_copy_events_from_cpu(
	+    struct eventlog_subscriber *subscriber,
	+    struct eventlog_percpu_buffer *pcpu_buf, struct uio *uio,
	+    uint64_t max_timestamp, uint64_t *next_timestamp_out,
	+    bool *uio_out_of_space_out)
	+{
	+	uint32_t bytes_consumed = 0;
	+	uint64_t next_timestamp;
	+	evtlog_state_t cur_state = evtlog_load_state(pcpu_buf);
	+	/* The reader buffer is whichever of the two is not active. */
	+	int reader = 1 - evtlog_state_active(cur_state);
	+	size_t space_avail = uio->uio_resid;
	+	uint32_t available = evtlog_state_reader_len(cur_state) -
	+	    pcpu_buf->read_pos;
	+
	+	MPASS(pcpu_buf != NULL);
	+	MPASS(uio != NULL);
	+	MPASS(next_timestamp_out != NULL);
	+	MPASS(uio_out_of_space_out != NULL);
	+	EVENTLOG_VALIDATE_READER(pcpu_buf);
	+
	+	*uio_out_of_space_out = false;
	+
	+	/* Scan events to compute contiguous batch within max_timestamp. */
	+	do {
	+		struct eventlog_event_header hdr;
	+		uint32_t offset = pcpu_buf->read_pos + bytes_consumed;
	+
	+		MPASS((available - bytes_consumed) >=
	+		    sizeof(struct eventlog_event_header));
	+		MPASS(offset < pcpu_buf->buffer_size);
	+		/* Copy the header out rather than aliasing the buffer. */
	+		memcpy(&hdr, (uint8_t *)pcpu_buf->buffers[reader] + offset,
	+		    sizeof(struct eventlog_event_header));
	+
	+		MPASS(hdr.event_length >= sizeof(struct eventlog_event_header));
	+		MPASS(hdr.event_length <= (available - bytes_consumed));
	+		MPASS(offset + hdr.event_length <= pcpu_buf->buffer_size);
	+		MPASS((available - bytes_consumed - hdr.event_length) == 0 ||
	+		    (available - bytes_consumed - hdr.event_length)
	+		    >= sizeof(struct eventlog_event_header));
	+
	+		next_timestamp = hdr.timestamp;
	+
	+		if (next_timestamp > max_timestamp)
	+			break;
	+
	+		if (bytes_consumed + hdr.event_length > space_avail) {
	+			*uio_out_of_space_out = true;
	+			break;
	+		}
	+
	+		bytes_consumed += hdr.event_length;
	+
	+	} while (available > bytes_consumed);
	+
	+	/* Copy the data into the uio buffer. */
	+	if (bytes_consumed > 0) {
	+		const char *src;
	+
	+		src = (const char *)((uint8_t *)pcpu_buf->buffers[reader] +
	+		    pcpu_buf->read_pos);
	+
	+		if (uio->uio_segflg == UIO_USERSPACE) {
	+			KASSERT(THREAD_CAN_SLEEP(),
	+			    ("eventlog copyout in non-sleepable context"));
	+			if (copyout(src, uio->uio_iov[0].iov_base,
	+			    bytes_consumed) != 0) {
	+				*uio_out_of_space_out = true;
	+				goto out;
	+			}
	+			uioadvance(uio, bytes_consumed);
	+		} else {
	+			bcopy(src, uio->uio_iov[0].iov_base, bytes_consumed);
	+			uioadvance(uio, bytes_consumed);
	+		}
	+
	+		pcpu_buf->read_pos += bytes_consumed;
	+		EVENTLOG_VALIDATE_READER(pcpu_buf);
	+
	+		/* 0 tells the caller the reader buffer is fully drained. */
	+		if (pcpu_buf->read_pos ==
	+		    evtlog_state_reader_len(evtlog_load_state(pcpu_buf)))
	+			next_timestamp = 0;
	+	}
	+
	+out:
	+	*next_timestamp_out = next_timestamp;
	+}
	+
	+/*
	+ * Write an event to a device-based subscriber's per-CPU buffer. Format:
	+ * header (includes provider_id, session_id, event_id) + payload.
	+ *
	+ * Implements the writer side of the SYNC MODEL at the top of this file;
	+ * the four numbered steps below correspond to (1)-(4) in that comment.
	+ *
	+ * Wakes the reader only after a proactive swap (a full buffer's worth of
	+ * data just moved into the reader buffer). Normal commits do not wake;
	+ * the reader is woken in batches.
	+ *
	+ * hdr:          fully built event header; hdr->cpu selects the per-CPU
	+ *               buffer and hdr->event_length is the total event size.
	+ * iov, iovcnt:  payload segments copied after the header.
	+ * payload_size: checked against hdr->event_length under INVARIANTS.
	+ *
	+ * An event that cannot be stored is dropped, and
	+ * subscriber->dropped_events is incremented.
	+ */
	+static EVENTLOG_INLINING void
	+eventlog_subscriber_write_event_device(struct eventlog_subscriber *subscriber,
	+    struct eventlog_provider *provider, uint64_t session_id,
	+    struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
	+    size_t payload_size)
	+{
	+	struct eventlog_percpu_buffer *pcpu_buf;
	+	uint8_t *buf;
	+	int active;
	+	uint32_t commit_pos;
	+	evtlog_state_t state;
	+	size_t event_len = hdr->event_length;
	+	bool did_swap = false;
	+	int i;
	+
	+	MPASS(subscriber != NULL);
	+	MPASS(provider != NULL);
	+	MPASS(hdr != NULL);
	+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_DEVICE);
	+	MPASS(hdr->cpu >= 0 && hdr->cpu <= mp_maxid);
	+#ifdef INVARIANTS
	+	size_t expected_length = sizeof(struct eventlog_event_header) +
	+	    payload_size;
	+	MPASS(hdr->event_length == expected_length);
	+#endif
	+
	+	pcpu_buf = &subscriber->u.device.percpu_buffers[hdr->cpu];
	+	MPASS(event_len <= pcpu_buf->buffer_size);
	+
	+#ifndef EVENTLOG_HAS_ATOMIC64
	+	/* NMI-on-lock-holder deadlock guard; see SYNC MODEL. */
	+	if (__predict_false(mtx_owned(&pcpu_buf->swap_lock))) {
	+		atomic_add_long(&subscriber->dropped_events, 1);
	+		return;
	+	}
	+#endif
	+
	+	/* (1) Load state to get active buffer and write offset. */
	+	state = evtlog_load_state(pcpu_buf);
	+	active = evtlog_state_active(state);
	+	commit_pos = evtlog_state_commit_pos(state);
	+
	+write:
	+	/*
	+	 * (2) Check capacity (re-derived every retry: an NMI or peer may
	+	 * have advanced commit_pos since we last loaded it).
	+	 */
	+	if (__predict_false(commit_pos + event_len > pcpu_buf->buffer_size)) {
	+		if (!did_swap && evtlog_state_swap_allowed(state)) {
	+			evtlog_try_swap(pcpu_buf, &state);
	+			/*
	+			 * *state holds the post-swap packed state regardless
	+			 * of who won; re-derive active/commit_pos from it so
	+			 * we never write at offset 0 over a peer's event.
	+			 */
	+			active = evtlog_state_active(state);
	+			commit_pos = evtlog_state_commit_pos(state);
	+			did_swap = true;
	+			if (__predict_false(commit_pos + event_len >
	+			    pcpu_buf->buffer_size)) {
	+				/*
	+				 * No room after the swap; a same-CPU NMI
	+				 * writer filled the new buffer. Drop.
	+				 */
	+				atomic_add_long(&subscriber->dropped_events, 1);
	+				return;
	+			}
	+		} else {
	+			/* Already swapped once, or swap not allowed: drop. */
	+			atomic_add_long(&subscriber->dropped_events, 1);
	+			return;
	+		}
	+	}
	+
	+	/* (3) Write: copy header then iov segments at commit_pos. */
	+	buf = (uint8_t *)pcpu_buf->buffers[active] + commit_pos;
	+	memcpy(buf, hdr, sizeof(struct eventlog_event_header));
	+	buf += sizeof(struct eventlog_event_header);
	+	for (i = 0; i < iovcnt; i++) {
	+		if (iov[i].iov_len > 0) {
	+			memcpy(buf, iov[i].iov_base, iov[i].iov_len);
	+			buf += iov[i].iov_len;
	+		}
	+	}
	+
	+	/*
	+	 * (4) Commit: CAS to advance commit_pos. If active or commit_pos
	+	 * moved (peer swap or NMI commit), our memcpy is at a stale
	+	 * offset and we redo the write via `goto write`. Reader drain
	+	 * only moves the upper bits or swap_allowed; the memcpy stays
	+	 * valid and we just retry the CAS.
	+	 */
	+	while (__predict_false(!evtlog_try_commit(pcpu_buf, &state,
	+	    (uint32_t)event_len))) {
	+		if (evtlog_state_active(state) != active ||
	+		    evtlog_state_commit_pos(state) != commit_pos) {
	+			active = evtlog_state_active(state);
	+			commit_pos = evtlog_state_commit_pos(state);
	+			goto write;
	+		}
	+	}
	+
	+	/*
	+	 * Wake reader only after a proactive swap - a full buffer's worth
	+	 * of data is now in the reader buffer.
	+	 */
	+	if (did_swap &&
	+	    atomic_cmpset_32(&subscriber->u.device.reader_waiting, 1, 0))
	+		wakeup(subscriber);
	+}
	+
	+/*
	+ * Deliver an event to a callback subscriber. The payload is passed as
	+ * the same scatter/gather iovec the write path carries internally;
	+ * callbacks that need a flat payload compact it themselves.
	+ *
	+ * The callback receives the header, the provider's name and name
	+ * length, the session id, the iovec array with its count, the payload
	+ * size, and the callback_arg registered at subscriber creation.
	+ */
	+static EVENTLOG_INLINING void
	+eventlog_subscriber_write_event_callback(
	+    struct eventlog_subscriber *subscriber,
	+    struct eventlog_provider *provider, uint64_t session_id,
	+    struct eventlog_event_header *hdr, const struct iovec *iov, int iovcnt,
	+    size_t payload_size)
	+{
	+	MPASS(subscriber->type == EVENTLOG_SUBSCRIBER_TYPE_CALLBACK);
	+	MPASS(subscriber->u.callback.callback != NULL);
	+
	+	subscriber->u.callback.callback(hdr, provider->name,
	+	    provider->name_len, session_id, iov, iovcnt, payload_size,
	+	    subscriber->u.callback.callback_arg);
	+}
	+
	+/*
	+ * Write an event to a subscriber.
	+ * Checks if subscriber has matching subscription and level/keywords match.
	+ * Routes to device or callback handler based on subscriber type.
	+ *
	+ * The event is delivered when the subscriber's subscription to
	+ * session->provider satisfies `level <= sub->level` and shares at least
	+ * one bit with `keywords`.  With no subscription to that provider
	+ * nothing is delivered.  `event_length` is only asserted here; the
	+ * handlers take the length from the header.
	+ */
	+static void
	+eventlog_subscriber_write_event(struct eventlog_subscriber *subscriber,
	+    struct eventlog_session *session, struct eventlog_event_header *hdr,
	+    const struct iovec *iov, int iovcnt, size_t payload_size,
	+    uint16_t event_length, enum eventlog_level level, uint32_t keywords)
	+{
	+	struct eventlog_subscription *sub;
	+	struct eventlog_provider *provider;
	+
	+	MPASS(subscriber != NULL);
	+	MPASS(session != NULL);
	+	MPASS(session->provider != NULL);
	+	MPASS(hdr != NULL);
	+	MPASS(event_length <= UINT16_MAX);
	+
	+	provider = session->provider;
	+
	+	/* Note: Called within SMR read section. */
	+	CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
	+		if (sub->provider != provider)
	+			continue;
	+		/*
	+		 * Only one subscription per provider per subscriber: return
	+		 * unconditionally below, even if the filter doesn't match.
	+		 */
	+		if (level <= sub->level &&
	+		    (keywords & sub->keywords) != 0) {
	+			if (subscriber->type ==
	+			    EVENTLOG_SUBSCRIBER_TYPE_DEVICE)
	+				eventlog_subscriber_write_event_device(
	+				    subscriber, provider,
	+				    session->session_id, hdr, iov, iovcnt,
	+				    payload_size);
	+			else
	+				eventlog_subscriber_write_event_callback(
	+				    subscriber, provider,
	+				    session->session_id, hdr, iov, iovcnt,
	+				    payload_size);
	+		}
	+		return;
	+	}
	+}
	+
	+/*
	+ * Fill `stats` with the subscriber's counters.
	+ *
	+ * Only dropped_events is reported: it is read from the subscriber with
	+ * an acquire load and widened to 64 bits.  Both pointers must be
	+ * non-NULL (asserted with MPASS).
	+ */
	+void
	+eventlog_subscriber_get_stats(struct eventlog_subscriber *subscriber,
	+    struct eventlog_stats *stats)
	+{
	+	uint64_t dropped;
	+
	+	MPASS(subscriber != NULL);
	+	MPASS(stats != NULL);
	+
	+	dropped = (uint64_t)atomic_load_acq_long(&subscriber->dropped_events);
	+	stats->dropped_events = dropped;
	+}
	+
	+/*
	+ * Device operations
	+ */
	+
	+/*
	+ * Device open handler. Subscriber is created via CREATE IOCTL.
	+ * Only prison0 (the host) may open: the eventlog framework is host-global
	+ * and not safe to expose to jailed processes.
	+ *
	+ * Returns EPERM for a jailed caller, ENODEV when any of FWRITE, FEXEC,
	+ * FAPPEND or O_TRUNC is requested, and 0 otherwise.
	+ */
	+static int
	+eventlog_dev_open(struct cdev *dev, int flags, int devtype __unused,
	+    struct thread *td)
	+{
	+	if (jailed(td->td_ucred))
	+		return (EPERM);
	+
	+	/* Only allow read access */
	+	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
	+		return (ENODEV);
	+	return (0);
	+}
	+
	+/*
	+ * Device close handler.  Always succeeds and does no work of its own.
	+ */
	+static int
	+eventlog_dev_close(struct cdev *dev __unused, int flags __unused,
	+    int devtype __unused, struct thread *td __unused)
	+{
	+
	+	/* Cleanup is handled by eventlog_dev_clear_cdevpriv */
	+	return (0);
	+}
	+
	+/*
	+ * cdevpriv destructor, registered through devfs_set_cdevpriv() by the
	+ * CREATE ioctl.  `data` is the subscriber stored at that time; it is
	+ * destroyed here.  A NULL `data` (CREATE failed, so no subscriber was
	+ * created) is ignored.
	+ */
	+static void
	+eventlog_dev_clear_cdevpriv(void *data)
	+{
	+	struct eventlog_subscriber *subscriber;
	+
	+	subscriber = (struct eventlog_subscriber *)data;
	+	if (subscriber != NULL)
	+		eventlog_subscriber_destroy(subscriber);
	+}
	+
	+/*
	+ * Device ioctl handler.
	+ *
	+ * Commands are matched with IOCBASECMD(), which masks out the parameter
	+ * length encoded in cmd.  Every case that touches the argument buffer
	+ * therefore checks IOCPARM_LEN(cmd) itself before reading or writing data.
	+ *
	+ * CREATE:        create the subscriber for this open file and add the
	+ *                requested subscriptions.  EEXIST if a subscriber is
	+ *                already attached, EINVAL for a short request or an
	+ *                out-of-range buffer size, otherwise the error from
	+ *                adding a subscription or storing the cdevpriv.
	+ * DESTROY:       tear down the subscriber attached to this open file.
	+ * GET_STATS:     fill in a struct eventlog_stats for the subscriber.
	+ * GET_PROVIDERS: list up to EVENTLOG_MAX_PROVIDERS subscribed providers.
	+ *
	+ * Commands other than CREATE return the devfs_get_cdevpriv() error when no
	+ * subscriber is attached.  Unknown commands return ENOTTY.
	+ */
	+static int
	+eventlog_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags,
	+    struct thread *td)
	+{
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_subscription_req *sub_req;
	+	uint32_t i;
	+	int error;
	+
	+	switch (IOCBASECMD(cmd)) {
	+	case IOCBASECMD(EVENTLOG_IOCTL_CREATE_BASE): {
	+		struct eventlog_create_req *req;
	+		size_t base_size, ioctl_len;
	+
	+		req = (struct eventlog_create_req *)data;
	+		base_size = __builtin_offsetof(
	+		    struct eventlog_create_req, subscriptions);
	+		ioctl_len = IOCPARM_LEN(cmd);
	+
	+		/* Check if subscriber already exists */
	+		error = devfs_get_cdevpriv((void **)&subscriber);
	+		if (error == 0)
	+			return (EEXIST); /* Subscriber already exists */
	+		if (error != ENOENT)
	+			return (error); /* Something weird is going on */
	+
	+		/*
	+		 * Validate request size.  The subscription count is
	+		 * compared by division so that count * element size can
	+		 * never wrap around on platforms with a 32-bit size_t.
	+		 */
	+		if (ioctl_len < base_size + sizeof(uint32_t) ||
	+		    req->count > (ioctl_len - base_size) /
	+		    sizeof(struct eventlog_subscription_req))
	+			return (EINVAL);
	+
	+		if (req->buffer_size_per_cpu < EVENTLOG_BUFFER_SIZE_MIN ||
	+		    req->buffer_size_per_cpu > EVENTLOG_BUFFER_SIZE_MAX)
	+			return (EINVAL);
	+
	+		/* Create subscriber with specified buffer size */
	+		subscriber = eventlog_subscriber_create_device(
	+		    req->buffer_size_per_cpu);
	+		MPASS(subscriber != NULL);
	+
	+		/* Process each subscription before setting cdevpriv. */
	+		for (i = 0; i < req->count; i++) {
	+			sub_req = &req->subscriptions[i];
	+			error = eventlog_subscriber_add_subscription(
	+			    subscriber, sub_req->provider_name, sub_req->level,
	+			    sub_req->keywords, sub_req->flags);
	+			if (error != 0) {
	+				eventlog_subscriber_destroy(subscriber);
	+				return (error);
	+			}
	+		}
	+
	+		/* Only store subscriber after all subscriptions succeed. */
	+		error = devfs_set_cdevpriv(subscriber,
	+		    eventlog_dev_clear_cdevpriv);
	+		if (error != 0) {
	+			eventlog_subscriber_destroy(subscriber);
	+			return (error);
	+		}
	+
	+		return (0);
	+	}
	+
	+	case IOCBASECMD(EVENTLOG_IOCTL_DESTROY): {
	+		error = devfs_get_cdevpriv((void **)&subscriber);
	+		if (error != 0)
	+			return (error);
	+
	+		/*
	+		 * Detach and destroy in one step.  devfs_clear_cdevpriv()
	+		 * removes the private data from the open file and runs the
	+		 * registered destructor (eventlog_dev_clear_cdevpriv), which
	+		 * destroys the subscriber.  Destroying it by hand and then
	+		 * calling devfs_set_cdevpriv(NULL, NULL) would leave the
	+		 * stale pointer attached, because devfs_set_cdevpriv()
	+		 * refuses with EBUSY when private data is already set.
	+		 */
	+		devfs_clear_cdevpriv();
	+
	+		return (0);
	+	}
	+
	+	case IOCBASECMD(EVENTLOG_IOCTL_GET_STATS): {
	+		u_int ioctl_len = IOCPARM_LEN(cmd);
	+
	+		if (ioctl_len < sizeof(struct eventlog_stats))
	+			return (EINVAL);
	+
	+		error = devfs_get_cdevpriv((void **)&subscriber);
	+		if (error != 0)
	+			return (error);
	+
	+		eventlog_subscriber_get_stats(subscriber,
	+		    (struct eventlog_stats *)data);
	+
	+		return (0);
	+	}
	+
	+	case IOCBASECMD(EVENTLOG_IOCTL_GET_PROVIDERS): {
	+		struct eventlog_get_providers_resp *resp;
	+		struct eventlog_subscription *sub;
	+		uint32_t count = 0;
	+
	+		/*
	+		 * The whole response structure is written below, so the
	+		 * caller-supplied buffer must be at least that large.
	+		 */
	+		if (IOCPARM_LEN(cmd) <
	+		    sizeof(struct eventlog_get_providers_resp))
	+			return (EINVAL);
	+
	+		error = devfs_get_cdevpriv((void **)&subscriber);
	+		if (error != 0)
	+			return (error);
	+
	+		resp = (struct eventlog_get_providers_resp *)data;
	+		memset(resp, 0, sizeof(*resp));
	+
	+		/* Walk the subscription list inside an SMR read section. */
	+		smr_enter(evl.smr);
	+		CK_SLIST_FOREACH(sub, &subscriber->subscriptions, link) {
	+			if (count >= EVENTLOG_MAX_PROVIDERS)
	+				break;
	+			resp->providers[count].provider_id =
	+			    sub->provider->provider_id;
	+			strlcpy(resp->providers[count].name,
	+			    sub->provider->name,
	+			    EVENTLOG_PROVIDER_NAME_MAX);
	+			count++;
	+		}
	+		smr_exit(evl.smr);
	+		resp->count = count;
	+
	+		return (0);
	+	}
	+
	+	default:
	+		return (ENOTTY);
	+	}
	+}
	+
	+/*
	+ * Device read handler.
	+ *
	+ * Looks up the subscriber attached to this open file and hands the request
	+ * to eventlog_subscriber_read().  Returns the devfs_get_cdevpriv() error
	+ * when no subscriber is attached, otherwise whatever
	+ * eventlog_subscriber_read() returns.
	+ */
	+static int
	+eventlog_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
	+{
	+	struct eventlog_subscriber *subscriber;
	+	int error;
	+
	+	error = devfs_get_cdevpriv((void **)&subscriber);
	+	if (error != 0)
	+		return (error);
	+
	+	return (eventlog_subscriber_read(subscriber, uio, flags));
	+}
	+
	+/*
	+ * Character device switch for the eventlog device.  It wires up the open,
	+ * close, read and ioctl handlers; no other entry points are set here.
	+ */
	+static struct cdevsw eventlog_cdevsw = {
	+ .d_version = D_VERSION,
	+ .d_open = eventlog_dev_open,
	+ .d_close = eventlog_dev_close,
	+ .d_read = eventlog_dev_read,
	+ .d_ioctl = eventlog_dev_ioctl,
	+ .d_name = "eventlog",
	+};
	+
	+/*
	+ * Initialize the single system-wide eventlog device.
	+ *
	+ * Creates the "eventlog" device node owned by root:operator with mode
	+ * 0640 and stores the resulting cdev in evl.device.  A failure is only
	+ * logged.  Registered through SYSINIT at SI_SUB_DRIVERS.
	+ */
	+static void
	+eventlog_device_init(void *unused __unused)
	+{
	+	struct make_dev_args mda;
	+	int error;
	+
	+	make_dev_args_init(&mda);
	+	mda.mda_devsw = &eventlog_cdevsw;
	+	mda.mda_uid = UID_ROOT;
	+	mda.mda_gid = GID_OPERATOR;
	+	mda.mda_mode = 0640;
	+	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
	+	error = make_dev_s(&mda, &evl.device, "eventlog");
	+	if (error != 0) {
	+		printf("eventlog: failed to create device: %d\n", error);
	+		return;
	+	}
	+}
	+SYSINIT(eventlog_device, SI_SUB_DRIVERS, SI_ORDER_MIDDLE,
	+    eventlog_device_init, NULL);
	diff --git a/sys/kern/kern_eventlog_test.c b/sys/kern/kern_eventlog_test.c
	new file mode 100644
	--- /dev/null
	+++ b/sys/kern/kern_eventlog_test.c
	@@ -0,0 +1,5173 @@
	+/*
	+ * Copyright (c) 2026 Netflix, Inc.
	+ *
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ */
	+
	+#include <tests/ktest.h>
	+#include <sys/param.h>
	+#include <sys/systm.h>
	+#include <sys/eventlog.h>
	+#include <sys/eventlog_subscriber.h>
	+#include <sys/sysctl.h>
	+#include <sys/condvar.h>
	+#include <sys/kthread.h>
	+#include <sys/mutex.h>
	+#include <sys/sleepqueue.h>
	+#include <sys/sx.h>
	+#include <sys/malloc.h>
	+#include <sys/time.h>
	+#include <sys/proc.h>
	+#include <sys/uio.h>
	+#include <sys/fcntl.h>
	+#include <machine/atomic.h>
	+#include <sys/callout.h>
	+#include <sys/libkern.h>
	+#include <eventlog/test_eventlog.h>
	+
	+/* malloc(9) type used for every allocation made by the tests in this file. */
	+MALLOC_DEFINE(M_EVENTLOG_TEST, "eventlog_test", "eventlog test subsystem");
	+
	+/*
	+ * Check that condition x is true.  Logs a PASS line when it is; otherwise
	+ * logs a FAIL line and returns EINVAL from the enclosing function.
	+ * Requires a ktest context named ctx in scope.
	+ */
	+#define KTEST_VERIFY(x) do {						\
	+	if (x) {							\
	+		KTEST_LOG(ctx, "PASS: %s", #x);				\
	+	} else {							\
	+		KTEST_ERR(ctx, "FAIL: %s", #x);				\
	+		return (EINVAL);					\
	+	}								\
	+} while (0)
	+
	+/*
	+ * Check that x == y.  Logs a PASS line when they are equal; otherwise logs
	+ * a FAIL line with both values and returns EINVAL from the enclosing
	+ * function.  Requires a ktest context named ctx in scope.
	+ *
	+ * The operands may be integers of any width or pointers, so they are
	+ * converted through intptr_t to intmax_t and printed with %jd instead of
	+ * being passed straight to %d.
	+ */
	+#define KTEST_EQUAL(x, y) do {						\
	+	if ((x) != (y)) {						\
	+		KTEST_ERR(ctx, "FAIL: %s != %s (%jd != %jd)", #x, #y,	\
	+		    (intmax_t)(intptr_t)(x), (intmax_t)(intptr_t)(y));	\
	+		return (EINVAL);					\
	+	} else {							\
	+		KTEST_LOG(ctx, "PASS: %s == %s", #x, #y);		\
	+	}								\
	+} while (0)
	+
	+/*
	+ * Check that x != y.  Logs a PASS line when they differ; otherwise logs a
	+ * FAIL line and returns EINVAL from the enclosing function.  Requires a
	+ * ktest context named ctx in scope.
	+ */
	+#define KTEST_NEQUAL(x, y) do {						\
	+	if ((x) != (y)) {						\
	+		KTEST_LOG(ctx, "PASS: %s != %s", #x, #y);		\
	+	} else {							\
	+		KTEST_ERR(ctx, "FAIL: %s == %s", #x, #y);		\
	+		return (EINVAL);					\
	+	}								\
	+} while (0)
	+
	+/*
	+ * Buffer size (64 KiB) the tests pass to eventlog_subscriber_create_device()
	+ * when they do not need a specific size.
	+ */
	+#define EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT (64 * 1024)
	+
	+/*
	+ * Helper: perform one eventlog_subscriber_read() into the kernel buffer
	+ * buf of bufsize bytes, using a single-segment UIO_SYSSPACE uio owned by
	+ * curthread.
	+ *
	+ * Returns the number of bytes placed in buf, or 0 if the read reported an
	+ * error (the error code itself is not propagated).
	+ */
	+static size_t
	+eventlog_read_into_buf(struct eventlog_subscriber *subscriber,
	+    void *buf, size_t bufsize, int flags)
	+{
	+	struct iovec vec = {
	+		.iov_base = buf,
	+		.iov_len = bufsize,
	+	};
	+	struct uio request = {
	+		.uio_iov = &vec,
	+		.uio_iovcnt = 1,
	+		.uio_offset = 0,
	+		.uio_resid = bufsize,
	+		.uio_segflg = UIO_SYSSPACE,
	+		.uio_rw = UIO_READ,
	+		.uio_td = curthread,
	+	};
	+
	+	if (eventlog_subscriber_read(subscriber, &request, flags) != 0)
	+		return (0);
	+
	+	/* Whatever was not left over in uio_resid has been transferred. */
	+	return (bufsize - request.uio_resid);
	+}
	+
	+/*
	+ * Callback test data structure.  One instance is handed to
	+ * test_event_callback as its callback argument; the callback records the
	+ * most recent event in it and the test body inspects it afterwards.
	+ */
	+struct test_callback_data {
	+ /* Incremented by one on every callback invocation */
	+ volatile uint32_t event_count;
	+ /* event_id from the header of the most recent event */
	+ volatile uint32_t last_event_id;
	+ /* iov[0].iov_base of the most recent event, or NULL if it had no iovec */
	+ volatile const void *last_payload;
	+ /* payload_size reported for the most recent event */
	+ volatile size_t last_payload_size;
	+ /* Only used for reading in test code, not in callback */
	+ struct mtx lock;
	+};
	+
	+/*
	+ * Subscriber callback used by the tests.  It counts the event and records
	+ * its id, payload size and the base address of the first iovec segment in
	+ * the test_callback_data passed as callback_arg.
	+ *
	+ * The payload is not copied: tests that look at last_payload after the
	+ * callback returns are only safe for iovcnt <= 1, where iov[0].iov_base
	+ * points at the caller's buffer; iovcnt > 1 would need to copy.
	+ */
	+static void
	+test_event_callback(const struct eventlog_event_header *hdr,
	+    const char *provider_name, uint8_t provider_name_len,
	+    uint64_t session_id,
	+    const struct iovec *iov, int iovcnt, size_t payload_size,
	+    void *callback_arg)
	+{
	+	struct test_callback_data *state = callback_arg;
	+	const void *first_segment;
	+
	+	if (iovcnt >= 1)
	+		first_segment = iov[0].iov_base;
	+	else
	+		first_segment = NULL;
	+
	+	atomic_add_int(&state->event_count, 1);
	+	atomic_store_rel_32(&state->last_event_id, hdr->event_id);
	+	state->last_payload = first_segment;
	+	atomic_store_rel_long(&state->last_payload_size, payload_size);
	+}
	+
	+/*
	+ * Helper function to enable a provider for testing by creating a callback
	+ * subscriber (using test_event_callback) and one subscription on
	+ * provider_name with the given level and keywords.
	+ *
	+ * On success returns the subscriber and stores the freshly allocated
	+ * callback data in *callback_data_out.  The caller must destroy the
	+ * subscriber, then mtx_destroy() the lock and free() the callback data
	+ * with M_EVENTLOG_TEST.
	+ *
	+ * On failure returns NULL, releases everything allocated here and leaves
	+ * *callback_data_out untouched.
	+ */
	+static struct eventlog_subscriber *
	+test_enable_provider_callback(const char *provider_name,
	+    enum eventlog_level level, uint32_t keywords,
	+    struct test_callback_data **callback_data_out)
	+{
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *callback_data;
	+
	+	callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data->lock, "test_callback", NULL, MTX_DEF);
	+	callback_data->event_count = 0;
	+
	+	subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data);
	+	if (subscriber == NULL)
	+		goto fail;
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, provider_name,
	+	    level, keywords, 0) != 0) {
	+		eventlog_subscriber_destroy(subscriber);
	+		goto fail;
	+	}
	+
	+	*callback_data_out = callback_data;
	+	return (subscriber);
	+
	+fail:
	+	/* The subscriber (if any) is already gone; drop the callback data. */
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	return (NULL);
	+}
	+
	+/*
	+ * Helper function to enable a provider for testing through a device
	+ * subscriber: creates one with the default test buffer size and adds a
	+ * single subscription on provider_name.  Use this when testing
	+ * device-specific functionality.
	+ *
	+ * Returns the subscriber, which the caller destroys when the test is done,
	+ * or NULL if either step failed (nothing is left allocated in that case).
	+ */
	+static struct eventlog_subscriber *
	+test_enable_provider_device(const char *provider_name,
	+    enum eventlog_level level, uint32_t keywords)
	+{
	+	struct eventlog_subscriber *devsub;
	+	int error;
	+
	+	devsub = eventlog_subscriber_create_device(
	+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+	if (devsub == NULL)
	+		return (NULL);
	+
	+	error = eventlog_subscriber_add_subscription(devsub, provider_name,
	+	    level, keywords, 0);
	+	if (error == 0)
	+		return (devsub);
	+
	+	eventlog_subscriber_destroy(devsub);
	+	return (NULL);
	+}
	+
	+/*
	+ * Callback state shared by every subscriber created through
	+ * test_enable_provider().  Callers of that helper never look at the
	+ * recorded values; the storage only has to outlive the subscriber, so it
	+ * is static instead of heap-allocated.
	+ */
	+static struct test_callback_data test_legacy_callback_data;
	+
	+/*
	+ * Legacy helper - enables a provider through a callback subscriber for
	+ * tests that only need the provider enabled and do not inspect the
	+ * delivered events.
	+ *
	+ * Returns the subscriber, or NULL on failure.  The caller releases it with
	+ * eventlog_subscriber_destroy(); there is no per-call callback data to
	+ * free, so nothing is leaked.
	+ */
	+static struct eventlog_subscriber *
	+test_enable_provider(const char *provider_name, enum eventlog_level level,
	+    uint32_t keywords)
	+{
	+	struct eventlog_subscriber *subscriber;
	+
	+	subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+	    &test_legacy_callback_data);
	+	if (subscriber == NULL)
	+		return (NULL);
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, provider_name,
	+	    level, keywords, 0) != 0) {
	+		eventlog_subscriber_destroy(subscriber);
	+		return (NULL);
	+	}
	+
	+	return (subscriber);
	+}
	+
	+/*
	+ * Create a provider called name with the given dump callback and argument,
	+ * then call eventlog_provider_set_default(provider, 1) on it.
	+ *
	+ * Returns the provider, or NULL if eventlog_provider_create() failed.
	+ */
	+static struct eventlog_provider *
	+test_create_provider(const char *name,
	+    eventlog_provider_dump_state_t dump_cb, void *dump_arg)
	+{
	+	struct eventlog_provider_config config = {
	+		.dump_callback = dump_cb,
	+		.dump_callback_arg = dump_arg,
	+	};
	+	struct eventlog_provider *provider;
	+
	+	provider = eventlog_provider_create(name, &config);
	+	if (provider == NULL)
	+		return (NULL);
	+
	+	eventlog_provider_set_default(provider, 1);
	+	return (provider);
	+}
	+
	+/*
	+ * Validates provider initialization and cleanup: a freshly created
	+ * provider with no subscribers reports level NONE and no keywords.
	+ */
	+KTEST_FUNC(provider_init_cleanup)
	+{
	+	struct eventlog_provider *prov;
	+
	+	KTEST_LOG(ctx, "Testing provider initialization and cleanup");
	+
	+	prov = test_create_provider("test_init", NULL, NULL);
	+	KTEST_NEQUAL(prov, NULL);
	+
	+	/* Nothing has subscribed yet. */
	+	KTEST_EQUAL(eventlog_provider_get_level(prov), EVENTLOG_LEVEL_NONE);
	+	KTEST_EQUAL(eventlog_provider_get_keywords(prov), 0);
	+
	+	eventlog_provider_destroy(prov);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates session creation and destruction, including the rejection of
	+ * a NULL provider.
	+ */
	+KTEST_FUNC(session_create_destroy)
	+{
	+	struct eventlog_provider *prov;
	+	struct eventlog_session *sess;
	+
	+	KTEST_LOG(ctx, "Testing session creation and destruction");
	+
	+	/* Without a provider no session may be handed out. */
	+	sess = eventlog_session_create(NULL, 0, true, NULL, 0);
	+	KTEST_EQUAL(sess, NULL);
	+
	+	prov = test_create_provider("test_sess", NULL, NULL);
	+	KTEST_NEQUAL(prov, NULL);
	+
	+	/* With a valid provider the same call must succeed. */
	+	sess = eventlog_session_create(prov, 0, true, NULL, 0);
	+	KTEST_NEQUAL(sess, NULL);
	+
	+	eventlog_session_destroy(sess);
	+	eventlog_provider_destroy(prov);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates basic event logging functionality: a single event can be
	+ * written on an enabled provider and everything tears down cleanly.  The
	+ * delivered event is not inspected here.
	+ */
	+KTEST_FUNC(event_logging_basic)
	+{
	+	struct eventlog_provider *prov;
	+	struct eventlog_subscriber *enabler;
	+	struct eventlog_session *sess;
	+	uint32_t event_id = 0x12345678;
	+	uint32_t payload = 0xdeadbeef;
	+
	+	KTEST_LOG(ctx, "Testing basic event logging");
	+
	+	prov = test_create_provider("test_basic", NULL, NULL);
	+	KTEST_NEQUAL(prov, NULL);
	+
	+	/* Enable provider for testing */
	+	enabler = test_enable_provider("test_basic", EVENTLOG_LEVEL_VERBOSE,
	+	    0xFFFFFFFF);
	+	KTEST_NEQUAL(enabler, NULL);
	+
	+	sess = eventlog_session_create(prov, 0, true, NULL, 0);
	+	KTEST_NEQUAL(sess, NULL);
	+
	+	/* Write event with test data */
	+	eventlog_event_write(sess, event_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+	    &payload, sizeof(payload));
	+
	+	eventlog_session_destroy(sess);
	+	eventlog_subscriber_destroy(enabler);
	+	eventlog_provider_destroy(prov);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates multiple events can be logged: writes three events and checks
	+ * that the callback subscriber saw all of them (plus SESSION_CREATE) and
	+ * that the last recorded id and payload belong to the third event.
	+ */
	+KTEST_FUNC(event_logging_multiple)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *test_sub;
	+	struct test_callback_data *callback_data;
	+	volatile const uint32_t *payload;
	+	uint32_t test_id1 = 0x11111111;
	+	uint32_t test_id2 = 0x22222222;
	+	uint32_t test_id3 = 0x33333333;
	+	uint32_t data1 = 0xAAAAAAAA;
	+	uint32_t data2 = 0xBBBBBBBB;
	+	uint32_t data3 = 0xCCCCCCCC;
	+	uint32_t ec, eid, last_payload_val;
	+
	+	KTEST_LOG(ctx, "Testing multiple event logging");
	+
	+	provider = test_create_provider("test_multi", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+	/* Enable provider for testing with callback subscriber */
	+	test_sub = test_enable_provider_callback("test_multi",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
	+	KTEST_NEQUAL(test_sub, NULL);
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	eventlog_event_write(session, test_id1, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, &data1, sizeof(data1));
	+	eventlog_event_write(session, test_id2, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, &data2, sizeof(data2));
	+	eventlog_event_write(session, test_id3, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, &data3, sizeof(data3));
	+
	+	/*
	+	 * Verify all three events were received (read then unlock;
	+	 * KTEST_EQUAL may sleep)
	+	 */
	+	mtx_lock(&callback_data->lock);
	+	ec = atomic_load_acq_32(&callback_data->event_count);
	+	eid = atomic_load_acq_32(&callback_data->last_event_id);
	+	/*
	+	 * last_payload still points at data3, which is live in this frame.
	+	 * Read the value through the pointer; if the callback never ran the
	+	 * pointer is NULL and the count check below reports the failure.
	+	 */
	+	payload = (volatile const uint32_t *)callback_data->last_payload;
	+	last_payload_val = (payload != NULL) ? *payload : 0;
	+	mtx_unlock(&callback_data->lock);
	+
	+	KTEST_EQUAL(ec, 4); /* SESSION_CREATE + 3 user events */
	+	KTEST_EQUAL(eid, test_id3);
	+	KTEST_EQUAL(last_payload_val, data3);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(test_sub);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates multiple providers can coexist: two providers are created,
	+ * each gets its own session, and all four objects are torn down again.
	+ */
	+KTEST_FUNC(provider_independence)
	+{
	+	struct eventlog_provider *provider1, *provider2;
	+	struct eventlog_session *session1, *session2;
	+
	+	KTEST_LOG(ctx, "Testing provider independence");
	+
	+	provider1 = test_create_provider("test_provider1", NULL, NULL);
	+	KTEST_NEQUAL(provider1, NULL);
	+	provider2 = test_create_provider("test_provider2", NULL, NULL);
	+	KTEST_NEQUAL(provider2, NULL);
	+
	+	session1 = eventlog_session_create(provider1, 0, true, NULL, 0);
	+	session2 = eventlog_session_create(provider2, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session1, NULL);
	+	KTEST_NEQUAL(session2, NULL);
	+
	+	eventlog_session_destroy(session1);
	+	eventlog_session_destroy(session2);
	+	eventlog_provider_destroy(provider1);
	+	eventlog_provider_destroy(provider2);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates event data integrity: writes three events with distinct
	+ * 16-byte payloads and checks that the callback subscriber's last recorded
	+ * id, size and payload words are exactly those of the third event, i.e.
	+ * earlier events did not interfere with it.
	+ */
	+KTEST_FUNC(event_data_integrity)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *test_sub;
	+	struct test_callback_data *callback_data;
	+	uint32_t test_id1 = 0x11111111;
	+	uint32_t test_id2 = 0x22222222;
	+	uint32_t test_id3 = 0x33333333;
	+	uint32_t test_data1[4] = {
	+	    0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD };
	+	uint32_t test_data2[4] = {
	+	    0x11111111, 0x22222222, 0x33333333, 0x44444444 };
	+	uint32_t test_data3[4] = {
	+	    0x55555555, 0x66666666, 0x77777777, 0x88888888 };
	+	uint32_t payload_copy[4] = { 0 };
	+	volatile const uint32_t *received_data;
	+	uint32_t ec, eid;
	+	size_t plen;
	+	int i;
	+
	+	KTEST_LOG(ctx,
	+	    "Testing event data integrity - multiple independent events");
	+
	+	provider = test_create_provider("test_integ", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+	/* Enable provider for testing with callback subscriber */
	+	test_sub = test_enable_provider_callback("test_integ",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
	+	KTEST_NEQUAL(test_sub, NULL);
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	/* Write events with different data */
	+	eventlog_event_write(session, test_id1, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, test_data1, sizeof(test_data1));
	+	eventlog_event_write(session, test_id2, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, test_data2, sizeof(test_data2));
	+	eventlog_event_write(session, test_id3, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, test_data3, sizeof(test_data3));
	+
	+	/*
	+	 * Verify all events were received with correct data (read then unlock;
	+	 * KTEST_EQUAL may sleep)
	+	 */
	+	mtx_lock(&callback_data->lock);
	+	ec = atomic_load_acq_32(&callback_data->event_count);
	+	eid = atomic_load_acq_32(&callback_data->last_event_id);
	+	plen = atomic_load_acq_long(&callback_data->last_payload_size);
	+	received_data = (volatile const uint32_t *)
	+	    callback_data->last_payload;
	+	/*
	+	 * Only copy when a payload of the expected length was recorded.
	+	 * Otherwise payload_copy stays zeroed and the checks below fail
	+	 * instead of reading through a NULL or too-short pointer.
	+	 */
	+	if (received_data != NULL && plen >= sizeof(payload_copy)) {
	+		for (i = 0; i < 4; i++)
	+			payload_copy[i] = received_data[i];
	+	}
	+	mtx_unlock(&callback_data->lock);
	+
	+	KTEST_EQUAL(ec, 4); /* SESSION_CREATE + 3 user events */
	+	KTEST_EQUAL(eid, test_id3);
	+	KTEST_EQUAL(plen, sizeof(test_data3));
	+	for (i = 0; i < 4; i++)
	+		KTEST_EQUAL(payload_copy[i], test_data3[i]);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(test_sub);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates different event sizes: writes one 64-byte event and one
	+ * 4096-byte event on separate providers and checks that each callback
	+ * subscriber recorded the right count, id and payload size.
	+ *
	+ * The payload sizes are compile-time constants so that no variable-length
	+ * arrays are needed, and the 4 KiB payload is heap-allocated to keep it
	+ * off the kernel stack.
	+ */
	+KTEST_FUNC(event_size_variations)
	+{
	+	enum { SMALL_SIZE = 64, LARGE_SIZE = 4096 };
	+	struct eventlog_provider *provider_small, *provider_large;
	+	struct eventlog_session *session_small, *session_large;
	+	struct test_callback_data *callback_data_small, *callback_data_large;
	+	struct eventlog_subscriber *test_sub_small;
	+	struct eventlog_subscriber *test_sub_large;
	+	uint32_t test_id_small = 0x1111;
	+	uint32_t test_id_large = 0x2222;
	+	uint32_t ec_small, eid_small, ec_large, eid_large;
	+	size_t plen_small, plen_large;
	+	uint8_t data_small[SMALL_SIZE];
	+	uint8_t *data_large;
	+	size_t i;
	+
	+	KTEST_LOG(ctx, "Testing different event sizes with full payloads");
	+
	+	provider_small = test_create_provider("test_small", NULL, NULL);
	+	KTEST_NEQUAL(provider_small, NULL);
	+	provider_large = test_create_provider("test_large", NULL, NULL);
	+	KTEST_NEQUAL(provider_large, NULL);
	+
	+	/* Enable providers for testing with callback subscribers */
	+	test_sub_small = test_enable_provider_callback("test_small",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data_small);
	+	test_sub_large = test_enable_provider_callback("test_large",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data_large);
	+	KTEST_NEQUAL(test_sub_small, NULL);
	+	KTEST_NEQUAL(test_sub_large, NULL);
	+
	+	session_small = eventlog_session_create(provider_small, 0, true, NULL,
	+	    0);
	+	session_large = eventlog_session_create(provider_large, 0, true, NULL,
	+	    0);
	+	KTEST_NEQUAL(session_small, NULL);
	+	KTEST_NEQUAL(session_large, NULL);
	+
	+	/* Create small event payload */
	+	for (i = 0; i < SMALL_SIZE; i++)
	+		data_small[i] = (uint8_t)(i & 0xFF);
	+	eventlog_event_write(session_small, test_id_small, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, data_small, sizeof(data_small));
	+
	+	/* Create large event payload */
	+	data_large = malloc(LARGE_SIZE, M_EVENTLOG_TEST, M_WAITOK);
	+	for (i = 0; i < LARGE_SIZE; i++)
	+		data_large[i] = (uint8_t)((i ^ 0xAA) & 0xFF);
	+	eventlog_event_write(session_large, test_id_large, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, data_large, LARGE_SIZE);
	+	/* Only the recorded size is checked below, not the bytes. */
	+	free(data_large, M_EVENTLOG_TEST);
	+
	+	/*
	+	 * Verify events were received (read then unlock; KTEST_EQUAL may
	+	 * sleep)
	+	 */
	+	mtx_lock(&callback_data_small->lock);
	+	ec_small = atomic_load_acq_32(&callback_data_small->event_count);
	+	eid_small = atomic_load_acq_32(&callback_data_small->last_event_id);
	+	plen_small = atomic_load_acq_long(
	+	    &callback_data_small->last_payload_size);
	+	mtx_unlock(&callback_data_small->lock);
	+	/* SESSION_CREATE + 1 user event */
	+	KTEST_EQUAL(ec_small, 2);
	+	KTEST_EQUAL(eid_small, test_id_small);
	+	KTEST_EQUAL(plen_small, sizeof(data_small));
	+
	+	mtx_lock(&callback_data_large->lock);
	+	ec_large = atomic_load_acq_32(&callback_data_large->event_count);
	+	eid_large = atomic_load_acq_32(&callback_data_large->last_event_id);
	+	plen_large = atomic_load_acq_long(
	+	    &callback_data_large->last_payload_size);
	+	mtx_unlock(&callback_data_large->lock);
	+	/* SESSION_CREATE + 1 user event */
	+	KTEST_EQUAL(ec_large, 2);
	+	KTEST_EQUAL(eid_large, test_id_large);
	+	KTEST_EQUAL(plen_large, (size_t)LARGE_SIZE);
	+
	+	eventlog_session_destroy(session_small);
	+	eventlog_session_destroy(session_large);
	+	eventlog_subscriber_destroy(test_sub_small);
	+	eventlog_subscriber_destroy(test_sub_large);
	+	mtx_destroy(&callback_data_small->lock);
	+	mtx_destroy(&callback_data_large->lock);
	+	free(callback_data_small, M_EVENTLOG_TEST);
	+	free(callback_data_large, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider_small);
	+	eventlog_provider_destroy(provider_large);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Structure for passing data to thread function (mt_event_thread) and for
	+ * reporting its progress and completion back to the creating thread.
	+ */
	+struct mt_test_data {
	+ /* Session the worker thread writes its events to */
	+ struct eventlog_session *session;
	+ /* Written into each event payload; event ids start at thread_id * 0x10000 */
	+ uint32_t thread_id;
	+ /* Number of events the worker thread writes */
	+ uint32_t num_events;
	+ /* Incremented by the worker thread after each write */
	+ uint32_t events_created;
	+ /* Held by the worker while it sets done and issues the wakeup */
	+ struct mtx completion_mtx;
	+ /* Set to 1 by the worker when finished; also the wakeup channel */
	+ int done;
	+};
	+
	+/*
	+ * Thread function that creates events.
	+ *
	+ * arg is a struct mt_test_data.  The thread writes num_events events to
	+ * the session, each carrying { thread_id, index } as payload and using
	+ * thread_id * 0x10000 + index as event id, bumping events_created after
	+ * every write.  It then sets done under completion_mtx, wakes up anyone
	+ * sleeping on &done and exits.
	+ */
	+static void
	+mt_event_thread(void *arg)
	+{
	+	struct mt_test_data *data = (struct mt_test_data *)arg;
	+	struct eventlog_session *session = data->session;
	+	uint32_t event_data[2];
	+	uint32_t i;
	+	uint32_t event_id_base = data->thread_id * 0x10000;
	+
	+	for (i = 0; i < data->num_events; i++) {
	+		/* Write thread ID and event index as data */
	+		event_data[0] = data->thread_id;
	+		event_data[1] = i;
	+
	+		eventlog_event_write(session, event_id_base + i,
	+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, event_data,
	+		    sizeof(event_data));
	+		data->events_created++;
	+	}
	+
	+	/* Signal completion */
	+	mtx_lock(&data->completion_mtx);
	+	data->done = 1;
	+	wakeup(&data->done);
	+	mtx_unlock(&data->completion_mtx);
	+
	+	kthread_exit();
	+}
	+
	+/*
	+ * Validates multi-threaded event logging.  A kernel thread running
	+ * mt_event_thread and the calling thread each write 100 events to the same
	+ * session concurrently; afterwards the callback subscriber must have seen
	+ * SESSION_CREATE plus every event from both writers.
	+ */
	+KTEST_FUNC(multithreaded_logging)
	+{
	+	struct eventlog_provider *prov;
	+	struct eventlog_session *sess;
	+	struct eventlog_subscriber *cb_sub;
	+	struct test_callback_data *cb_data;
	+	struct mt_test_data worker;
	+	struct thread *worker_td;
	+	uint32_t main_thread_id = 0xAAAA;
	+	uint32_t thread_id = 0xBBBB;
	+	uint32_t num_events_per_thread = 100;
	+	uint32_t main_event_data[2];
	+	uint32_t seen;
	+	uint32_t n;
	+	int error;
	+
	+	KTEST_LOG(ctx, "Testing multi-threaded event logging");
	+
	+	prov = test_create_provider("test_mt", NULL, NULL);
	+	KTEST_NEQUAL(prov, NULL);
	+
	+	/* Enable provider for testing with callback subscriber */
	+	cb_sub = test_enable_provider_callback("test_mt",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &cb_data);
	+	KTEST_NEQUAL(cb_sub, NULL);
	+	sess = eventlog_session_create(prov, 0, true, NULL, 0);
	+	KTEST_NEQUAL(sess, NULL);
	+
	+	/* Describe the worker's job; bzero leaves the counters at zero. */
	+	bzero(&worker, sizeof(worker));
	+	worker.session = sess;
	+	worker.thread_id = thread_id;
	+	worker.num_events = num_events_per_thread;
	+	mtx_init(&worker.completion_mtx, "mt_test", NULL, MTX_DEF);
	+
	+	/* Start the worker */
	+	error = kthread_add(mt_event_thread, &worker, NULL, &worker_td,
	+	    0, 0, "eventlog_mt_test");
	+	KTEST_EQUAL(error, 0);
	+
	+	/* This thread writes its own events while the worker is running */
	+	for (n = 0; n < num_events_per_thread; n++) {
	+		main_event_data[0] = main_thread_id;
	+		main_event_data[1] = n;
	+		eventlog_event_write(sess, main_thread_id + n,
	+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, main_event_data,
	+		    sizeof(main_event_data));
	+	}
	+
	+	/* Sleep until the worker reports completion */
	+	mtx_lock(&worker.completion_mtx);
	+	while (!worker.done) {
	+		msleep(&worker.done, &worker.completion_mtx, 0,
	+		    "mt_wait", 0);
	+	}
	+	mtx_unlock(&worker.completion_mtx);
	+
	+	/* The worker must have written everything it was asked to */
	+	KTEST_EQUAL(worker.events_created, num_events_per_thread);
	+
	+	/*
	+	 * Total events seen by the callback (read then unlock; KTEST_EQUAL
	+	 * may sleep)
	+	 */
	+	mtx_lock(&cb_data->lock);
	+	seen = atomic_load_acq_32(&cb_data->event_count);
	+	mtx_unlock(&cb_data->lock);
	+	/* SESSION_CREATE + events from 2 threads */
	+	KTEST_EQUAL(seen, 1 + num_events_per_thread * 2);
	+
	+	mtx_destroy(&worker.completion_mtx);
	+	eventlog_session_destroy(sess);
	+	eventlog_subscriber_destroy(cb_sub);
	+	mtx_destroy(&cb_data->lock);
	+	free(cb_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(prov);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates subscriber creation and destruction for both types: a device
	+ * subscriber with the default test buffer size, and a callback subscriber
	+ * using test_event_callback.
	+ */
	+KTEST_FUNC(subscriber_create_destroy)
	+{
	+	struct eventlog_subscriber *subscriber_device, *subscriber_callback;
	+	struct test_callback_data *callback_data;
	+
	+	KTEST_LOG(ctx, "Testing subscriber creation and destruction");
	+
	+	/* Test device subscriber */
	+	subscriber_device = eventlog_subscriber_create_device(
	+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+	KTEST_NEQUAL(subscriber_device, NULL);
	+	eventlog_subscriber_destroy(subscriber_device);
	+
	+	/* Test callback subscriber */
	+	callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data->lock, "test_callback", NULL, MTX_DEF);
	+	subscriber_callback = eventlog_subscriber_create_callback(
	+	    test_event_callback, callback_data);
	+	KTEST_NEQUAL(subscriber_callback, NULL);
	+	eventlog_subscriber_destroy(subscriber_callback);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+
	+	return (0);
	+}
	+
	+
	+/*
	+ * Validates multiple subscribers with the same provider.
	+ *
	+ * One device subscriber and two callback subscribers subscribe to the same
	+ * provider with different levels and keywords.  The provider's effective
	+ * keywords must be the OR of all subscriptions and its level the most
	+ * verbose one, and both must be recomputed as subscribers are destroyed.
	+ */
	+KTEST_FUNC(subscriber_multiple_subscribers)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *sub1, *sub2, *sub3;
	+	struct test_callback_data *callback_data2, *callback_data3;
	+	int error;
	+
	+	KTEST_LOG(ctx, "Testing multiple subscribers with same provider");
	+
	+	provider = test_create_provider("test_msub", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	/* Mix device and callback subscribers */
	+	sub1 = eventlog_subscriber_create_device(
	+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+	callback_data2 = malloc(sizeof(*callback_data2), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data2->lock, "test_callback2", NULL, MTX_DEF);
	+	sub2 = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data2);
	+	callback_data3 = malloc(sizeof(*callback_data3), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data3->lock, "test_callback3", NULL, MTX_DEF);
	+	sub3 = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data3);
	+	KTEST_NEQUAL(sub1, NULL);
	+	KTEST_NEQUAL(sub2, NULL);
	+	KTEST_NEQUAL(sub3, NULL);
	+
	+	/* Each subscriber subscribes with different parameters */
	+	error = eventlog_subscriber_add_subscription(sub1, "test_msub",
	+	    EVENTLOG_LEVEL_INFO, 0x1, 0);
	+	KTEST_EQUAL(error, 0);
	+
	+	error = eventlog_subscriber_add_subscription(sub2, "test_msub",
	+	    EVENTLOG_LEVEL_WARN, 0x2, 0);
	+	KTEST_EQUAL(error, 0);
	+
	+	error = eventlog_subscriber_add_subscription(sub3, "test_msub",
	+	    EVENTLOG_LEVEL_VERBOSE, 0x4, 0);
	+	KTEST_EQUAL(error, 0);
	+
	+	/*
	+	 * Verify provider enablement: keywords OR'ed (0x1 | 0x2 | 0x4 = 0x7),
	+	 * level is MAX (most verbose) = VERBOSE
	+	 */
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x7);
	+	KTEST_EQUAL(eventlog_provider_get_level(provider),
	+	    EVENTLOG_LEVEL_VERBOSE);
	+
	+	/* Remove one subscriber */
	+	eventlog_subscriber_destroy(sub2);
	+	mtx_destroy(&callback_data2->lock);
	+	free(callback_data2, M_EVENTLOG_TEST);
	+
	+	/* Provider should still be enabled with remaining subscribers */
	+	/* 0x1 | 0x4 */
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x5);
	+	/* MAX(INFO, VERBOSE) */
	+	KTEST_EQUAL(eventlog_provider_get_level(provider),
	+	    EVENTLOG_LEVEL_VERBOSE);
	+
	+	/* Remove all remaining subscribers */
	+	eventlog_subscriber_destroy(sub1);
	+	eventlog_subscriber_destroy(sub3);
	+	mtx_destroy(&callback_data3->lock);
	+	free(callback_data3, M_EVENTLOG_TEST);
	+
	+	/* Provider should be disabled */
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0);
	+	KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_NONE);
	+
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates provider enablement aggregation across subscribers: the
	+ * provider's keywords are the OR of all subscriptions and its level is the
	+ * MAX (most verbose) of them.  Also checks that adding a subscription for
	+ * a provider a subscriber already has updates that subscription in place.
	+ */
	+KTEST_FUNC(subscriber_provider_enablement_aggregation)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *sub1, *sub2;
	+	int error;
	+
	+	KTEST_LOG(ctx, "Testing provider enablement aggregation");
	+
	+	provider = test_create_provider("test_agg", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	sub1 = eventlog_subscriber_create_device(
	+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+	sub2 = eventlog_subscriber_create_device(
	+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+	KTEST_NEQUAL(sub1, NULL);
	+	KTEST_NEQUAL(sub2, NULL);
	+
	+	/* Subscriber 1: INFO level, keywords 0x1 */
	+	error = eventlog_subscriber_add_subscription(sub1, "test_agg",
	+	    EVENTLOG_LEVEL_INFO, 0x1, 0);
	+	KTEST_EQUAL(error, 0);
	+	KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_INFO);
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x1);
	+
	+	/* Subscriber 2: WARN level, keywords 0x2 (should give INFO, 0x3) */
	+	error = eventlog_subscriber_add_subscription(sub2, "test_agg",
	+	    EVENTLOG_LEVEL_WARN, 0x2, 0);
	+	KTEST_EQUAL(error, 0);
	+	/* MAX(INFO, WARN) */
	+	KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_INFO);
	+	/* 0x1 | 0x2 */
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x3);
	+
	+	/* Update subscriber 1 to VERBOSE (should give VERBOSE, since MAX) */
	+	error = eventlog_subscriber_add_subscription(sub1, "test_agg",
	+	    EVENTLOG_LEVEL_VERBOSE, 0x1, 0);
	+	KTEST_EQUAL(error, 0);
	+	/* MAX(VERBOSE, WARN) */
	+	KTEST_EQUAL(eventlog_provider_get_level(provider),
	+	    EVENTLOG_LEVEL_VERBOSE);
	+	/* Still 0x1 | 0x2 */
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x3);
	+
	+	/* Update subscriber 2 to ERROR (should result in VERBOSE, since MAX) */
	+	error = eventlog_subscriber_add_subscription(sub2, "test_agg",
	+	    EVENTLOG_LEVEL_ERROR, 0x2, 0);
	+	KTEST_EQUAL(error, 0);
	+	/* MAX(VERBOSE, ERROR) */
	+	KTEST_EQUAL(eventlog_provider_get_level(provider),
	+	    EVENTLOG_LEVEL_VERBOSE);
	+	/* Still 0x1 | 0x2 */
	+	KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x3);
	+
	+	/* Cleanup */
	+	eventlog_subscriber_destroy(sub1);
	+	eventlog_subscriber_destroy(sub2);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+
	+/*
	+ * Validates device subscriber buffer functionality: an event written to
	+ * an enabled provider can be read back from the device subscriber, and a
	+ * second read right after it returns nothing.
	+ */
	+KTEST_FUNC(subscriber_device_buffer)
	+{
	+	struct eventlog_provider *prov;
	+	struct eventlog_session *sess;
	+	struct eventlog_subscriber *devsub;
	+	uint32_t event_id = 0x12345678;
	+	uint32_t payload = 0xdeadbeef;
	+	char scratch[1024];
	+	size_t nread;
	+
	+	KTEST_LOG(ctx, "Testing device subscriber buffer functionality");
	+
	+	prov = test_create_provider("test_devbuf", NULL, NULL);
	+	KTEST_NEQUAL(prov, NULL);
	+	devsub = test_enable_provider_device("test_devbuf",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
	+	KTEST_NEQUAL(devsub, NULL);
	+	sess = eventlog_session_create(prov, 0, true, NULL, 0);
	+	KTEST_NEQUAL(sess, NULL);
	+
	+	/* Session creation queued SESSION_CREATE; drain it first */
	+	nread = eventlog_read_into_buf(devsub, scratch, sizeof(scratch), 0);
	+	KTEST_VERIFY(nread > 0);
	+	nread = eventlog_read_into_buf(devsub, scratch, sizeof(scratch), 0);
	+	KTEST_EQUAL(nread, 0);
	+
	+	/* Write event */
	+	eventlog_event_write(sess, event_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+	    &payload, sizeof(payload));
	+
	+	/* The event must now be readable from the buffer */
	+	nread = eventlog_read_into_buf(devsub, scratch, sizeof(scratch), 0);
	+	KTEST_VERIFY(nread > 0);
	+
	+	/* ...and the buffer must be empty again after that read */
	+	nread = eventlog_read_into_buf(devsub, scratch, sizeof(scratch), 0);
	+	KTEST_EQUAL(nread, 0);
	+
	+	eventlog_session_destroy(sess);
	+	eventlog_subscriber_destroy(devsub);
	+	eventlog_provider_destroy(prov);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Writes one INFO event for every index in [first, last).  Each event uses
	+ * base_id + index as its id and carries the index as a uint32_t payload.
	+ */
	+static void
	+circ_write_indexed_events(struct eventlog_session *session, uint32_t base_id,
	+    size_t first, size_t last)
	+{
	+	size_t idx;
	+
	+	for (idx = first; idx < last; idx++) {
	+		uint32_t val = (uint32_t)idx;
	+
	+		eventlog_event_write(session, base_id + idx,
	+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &val, sizeof(val));
	+	}
	+}
	+
	+/*
	+ * Validates double-buffering functionality.
	+ *
	+ * Phase 1 nearly fills a 128KB device subscriber and checks that one read
	+ * returns data without any drops and leaves the subscriber empty.  Phase 2
	+ * interleaves writes and reads (write 50, read, write 50, read) and checks
	+ * that data keeps flowing without drops.
	+ *
	+ * Returns 0 on success and EINVAL when the fixture cannot be set up.
	+ */
	+KTEST_FUNC(subscriber_circular_buffer)
	+{
	+	/* 128KB - above 64KB minimum, triggers reasonable swaps. */
	+	const size_t buf_size = 128 * 1024;
	+	const size_t read_buf_size = 256 * 1024;
	+	const uint32_t base_id = 0x12345678;
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_session *session;
	+	struct eventlog_stats stats;
	+	size_t event_size, capacity, read;
	+	char *read_buf;
	+
	+	KTEST_LOG(ctx, "Testing double-buffering functionality");
	+
	+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+	KTEST_NEQUAL(read_buf, NULL);
	+
	+	/* Build the fixture; unwind whatever exists if a step fails. */
	+	provider = test_create_provider("test_circ", NULL, NULL);
	+	if (provider == NULL)
	+		goto fail_free;
	+
	+	subscriber = eventlog_subscriber_create_device(buf_size);
	+	if (subscriber == NULL)
	+		goto fail_provider;
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, "test_circ",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0)
	+		goto fail_subscriber;
	+
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	if (session == NULL)
	+		goto fail_subscriber;
	+
	+	/*
	+	 * Per-event footprint used for diagnostics and for sizing the fill
	+	 * loop: the event header (which includes provider_id and session_id)
	+	 * plus a uint32_t payload.
	+	 */
	+	event_size = sizeof(struct eventlog_event_header) + sizeof(uint32_t);
	+	capacity = buf_size / event_size;
	+	KTEST_LOG(ctx,
	+	    "Expected event size: %zu bytes, buffer size: %zu bytes, "
	+	    "max events: %zu",
	+	    event_size, buf_size, capacity);
	+
	+	/*
	+	 * SESSION_CREATE already occupies the buffer, so write one event
	+	 * fewer than the computed capacity to stay clear of overflow.
	+	 */
	+	circ_write_indexed_events(session, base_id, 0, capacity - 1);
	+
	+	/* Read everything back - this should trigger a buffer swap. */
	+	eventlog_subscriber_get_stats(subscriber, &stats);
	+	read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size, 0);
	+	KTEST_LOG(ctx, "Read %zu bytes, dropped %llu events",
	+	    read, (unsigned long long)stats.dropped_events);
	+	KTEST_VERIFY(read > 0);
	+	/* Nothing may be dropped while the buffer is large enough. */
	+	KTEST_VERIFY(stats.dropped_events == 0);
	+
	+	/* A second read must find the subscriber empty. */
	+	read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size, 0);
	+	KTEST_EQUAL(read, 0);
	+
	+	/*
	+	 * Swap exercise: write a batch, read with a reduced byte limit, then
	+	 * write a second batch.  After the swap writers continue on the new
	+	 * active buffer while the reader works on the swapped-out one.
	+	 */
	+	circ_write_indexed_events(session, base_id, 0, 50);
	+
	+	/* Read with half the buffer as the limit - this swaps buffers. */
	+	read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size / 2,
	+	    0);
	+	KTEST_VERIFY(read > 0);
	+
	+	/* These land in the new active buffer (no reader contention). */
	+	circ_write_indexed_events(session, base_id, 50, 100);
	+
	+	/* Pick up what is left and confirm that nothing was dropped. */
	+	eventlog_subscriber_get_stats(subscriber, &stats);
	+	read = eventlog_read_into_buf(subscriber, read_buf, read_buf_size, 0);
	+	KTEST_VERIFY(read > 0);
	+	KTEST_EQUAL(stats.dropped_events, 0);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	eventlog_provider_destroy(provider);
	+	free(read_buf, M_EVENTLOG_TEST);
	+
	+	return (0);
	+
	+fail_subscriber:
	+	eventlog_subscriber_destroy(subscriber);
	+fail_provider:
	+	eventlog_provider_destroy(provider);
	+fail_free:
	+	free(read_buf, M_EVENTLOG_TEST);
	+	return (EINVAL);
	+}
	+
	+/*
	+ * Validates callback subscriber functionality.
	+ *
	+ * A callback subscriber is attached to the provider, a session is created
	+ * (which delivers SESSION_CREATE to the callback) and one user event is
	+ * written.  The test then checks the event count, last event id, payload
	+ * size and payload value recorded in the callback data.
	+ */
	+KTEST_FUNC(subscriber_callback)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *callback_data;
	+	uint32_t test_id = 0x12345678;
	+	uint32_t test_data = 0xdeadbeef;
	+
	+	KTEST_LOG(ctx, "Testing callback subscriber functionality");
	+
	+	provider = test_create_provider("test_cb", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+	subscriber = test_enable_provider_callback("test_cb",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	/* Callback already received SESSION_CREATE from session creation */
	+	KTEST_EQUAL(atomic_load_acq_32(&callback_data->event_count), 1);
	+
	+	/* Write event */
	+	eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+	    &test_data, sizeof(test_data));
	+
	+	/*
	+	 * Verify callback was invoked.  Snapshot the recorded state under the
	+	 * lock, then unlock before asserting (KTEST_EQUAL may sleep).
	+	 */
	+	{
	+		uint32_t ec, eid, last_payload_val;
	+		size_t plen;
	+
	+		mtx_lock(&callback_data->lock);
	+		ec = atomic_load_acq_32(&callback_data->event_count);
	+		eid = atomic_load_acq_32(&callback_data->last_event_id);
	+		plen = atomic_load_acq_long(&callback_data->last_payload_size);
	+		/*
	+		 * Read the first 32 bits of the recorded payload.  The
	+		 * dereference and pointer cast are required: without them the
	+		 * payload's address, not its contents, would be compared.
	+		 */
	+		last_payload_val = *(volatile const uint32_t *)
	+		    callback_data->last_payload;
	+		mtx_unlock(&callback_data->lock);
	+		KTEST_EQUAL(ec, 2); /* SESSION_CREATE + 1 user event */
	+		KTEST_EQUAL(eid, test_id);
	+		KTEST_EQUAL(plen, sizeof(test_data));
	+		KTEST_EQUAL(last_payload_val, test_data);
	+	}
	+
	+	/*
	+	 * Destroy the subscriber before its callback data so that no callback
	+	 * can run against a destroyed lock or freed memory.
	+	 */
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Shared state for the concurrent read/write test.  The structure lives on
	+ * the test function's stack, so both worker threads must have signalled
	+ * their exit before the test function is allowed to return.
	+ */
	+struct concurrent_test_data {
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	volatile int done;	/* Set by the test to stop both workers */
	+	int writer_exited; /* Protected by atomics; used as wait channel */
	+	int reader_exited; /* Protected by atomics; used as wait channel */
	+	volatile uint64_t events_written;
	+	volatile uint64_t events_read;
	+	volatile uint64_t bytes_read;
	+	struct mtx lock;
	+};
	+
	+/*
	+ * Writer thread - continuously writes events until data->done is set, then
	+ * publishes writer_exited so the test can wait for it.
	+ */
	+static void
	+concurrent_writer_thread(void *arg)
	+{
	+	struct concurrent_test_data *data = (struct concurrent_test_data *)arg;
	+	uint32_t test_id = 0x1000;
	+	uint32_t test_data[10];
	+	int i;
	+
	+	for (i = 0; i < 10; i++)
	+		test_data[i] = i;
	+
	+	while (data->done == 0) {
	+		eventlog_event_write(data->session, test_id++,
	+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, test_data,
	+		    sizeof(test_data));
	+		atomic_add_64(&data->events_written, 1);
	+		kern_yield(PRI_UNCHANGED); /* Yield to allow reads */
	+	}
	+
	+	/* Last access to *data; afterwards only its address is used. */
	+	atomic_store_rel_int(&data->writer_exited, 1);
	+	wakeup(&data->writer_exited);
	+	kthread_exit();
	+}
	+
	+/*
	+ * Reader thread - continuously reads events, triggering swaps, until
	+ * data->done is set, then publishes reader_exited.
	+ */
	+static void
	+concurrent_reader_thread(void *arg)
	+{
	+	struct concurrent_test_data *data = (struct concurrent_test_data *)arg;
	+	char read_buf[8 * 1024];
	+	size_t read_bytes;
	+
	+	while (data->done == 0) {
	+		read_bytes = eventlog_read_into_buf(data->subscriber, read_buf,
	+		    sizeof(read_buf), 0);
	+		if (read_bytes > 0) {
	+			atomic_add_64(&data->bytes_read, read_bytes);
	+			atomic_add_64(&data->events_read, 1);
	+		}
	+		kern_yield(PRI_UNCHANGED); /* Yield to allow writes */
	+	}
	+
	+	/* Last access to *data; afterwards only its address is used. */
	+	atomic_store_rel_int(&data->reader_exited, 1);
	+	wakeup(&data->reader_exited);
	+	kthread_exit();
	+}
	+
	+/*
	+ * Stops whichever worker threads were started, waits until each one has
	+ * signalled its exit, drains the subscriber and releases every object the
	+ * test owns.  This must complete before the test function returns, because
	+ * the workers dereference the stack-allocated test data and the session.
	+ */
	+static void
	+concurrent_test_teardown(struct concurrent_test_data *data,
	+    struct eventlog_provider *provider, bool writer_started,
	+    bool reader_started)
	+{
	+	char drain_buf[8 * 1024];
	+	size_t drain_read;
	+
	+	/* Stop threads - wake reader if blocked. */
	+	data->done = 1;
	+	wakeup(data->subscriber);
	+	while (reader_started &&
	+	    atomic_load_acq_int(&data->reader_exited) == 0)
	+		tsleep(&data->reader_exited, 0, "evtlog_rdwait", hz / 10);
	+	while (writer_started &&
	+	    atomic_load_acq_int(&data->writer_exited) == 0)
	+		tsleep(&data->writer_exited, 0, "evtlog_wrwait", hz / 10);
	+
	+	/* Drain remaining events (workers have exited, single reader). */
	+	do {
	+		drain_read = eventlog_read_into_buf(data->subscriber,
	+		    drain_buf, sizeof(drain_buf), 0);
	+	} while (drain_read > 0);
	+
	+	mtx_destroy(&data->lock);
	+	eventlog_session_destroy(data->session);
	+	eventlog_subscriber_destroy(data->subscriber);
	+	eventlog_provider_destroy(provider);
	+}
	+
	+/*
	+ * Validates double-buffering race conditions.
	+ * Tests concurrent reads and writes with frequent buffer swaps to ensure
	+ * no memory corruption or crashes occur.
	+ *
	+ * The progress assertions run only after both workers have been stopped
	+ * and the fixture torn down, so a failing assertion can no longer return
	+ * while a thread is still using the stack-allocated test data.
	+ */
	+KTEST_FUNC(subscriber_double_buffer_race)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct concurrent_test_data test_data;
	+	struct thread *writer_thread, *reader_thread;
	+	int error;
	+	uint64_t initial_written, initial_read, initial_bytes;
	+	uint64_t final_written, final_read, final_bytes;
	+	uint32_t test_id = 0x2000;
	+	uint32_t prefill_data[5];
	+
	+	KTEST_LOG(ctx, "Testing double-buffering race conditions");
	+
	+	provider = test_create_provider("test_dbrace", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	subscriber = eventlog_subscriber_create_device(128 * 1024);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, "test_dbrace",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
	+		eventlog_subscriber_destroy(subscriber);
	+		eventlog_provider_destroy(provider);
	+		return (EINVAL);
	+	}
	+
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	bzero(&test_data, sizeof(test_data));
	+	test_data.session = session;
	+	test_data.subscriber = subscriber;
	+	test_data.done = 0;
	+	test_data.writer_exited = 0;
	+	test_data.reader_exited = 0;
	+	test_data.events_written = 0;
	+	test_data.events_read = 0;
	+	test_data.bytes_read = 0;
	+	mtx_init(&test_data.lock, "concurrent_test", NULL, MTX_DEF);
	+
	+	/* Pre-fill buffer to trigger initial swap */
	+	for (int i = 0; i < 5; i++)
	+		prefill_data[i] = i;
	+
	+	for (int i = 0; i < 50; i++) {
	+		eventlog_event_write(session, test_id++, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, prefill_data, sizeof(prefill_data));
	+	}
	+
	+	/* Create writer thread */
	+	error = kthread_add(concurrent_writer_thread, &test_data, NULL,
	+	    &writer_thread, 0, 0, "evtlog_writer");
	+	if (error != 0) {
	+		/* Nothing is running yet; just release the fixture. */
	+		KTEST_LOG(ctx, "kthread_add(evtlog_writer) failed: %d", error);
	+		concurrent_test_teardown(&test_data, provider, false, false);
	+		return (error);
	+	}
	+	KTEST_EQUAL(error, 0);
	+
	+	/* Create reader thread */
	+	error = kthread_add(concurrent_reader_thread, &test_data, NULL,
	+	    &reader_thread, 0, 0, "evtlog_reader");
	+	if (error != 0) {
	+		/* The writer is already running and must be stopped first. */
	+		KTEST_LOG(ctx, "kthread_add(evtlog_reader) failed: %d", error);
	+		concurrent_test_teardown(&test_data, provider, true, false);
	+		return (error);
	+	}
	+	KTEST_EQUAL(error, 0);
	+
	+	/* Let threads run for a bit to exercise race conditions */
	+	tsleep(&test_data, 0, "test_run", hz / 2); /* 500ms */
	+
	+	initial_written = atomic_load_acq_64(&test_data.events_written);
	+	initial_read = atomic_load_acq_64(&test_data.events_read);
	+	initial_bytes = atomic_load_acq_64(&test_data.bytes_read);
	+
	+	KTEST_LOG(ctx,
	+	    "After 500ms: wrote %llu events, read %llu times, %llu bytes",
	+	    (unsigned long long)initial_written,
	+	    (unsigned long long)initial_read,
	+	    (unsigned long long)initial_bytes);
	+
	+	/* Continue for another period to ensure stability */
	+	tsleep(&test_data, 0, "test_run2", hz / 2); /* Another 500ms */
	+
	+	final_written = atomic_load_acq_64(&test_data.events_written);
	+	final_read = atomic_load_acq_64(&test_data.events_read);
	+	final_bytes = atomic_load_acq_64(&test_data.bytes_read);
	+
	+	KTEST_LOG(ctx,
	+	    "After 1s: wrote %llu events, read %llu times, %llu bytes",
	+	    (unsigned long long)final_written,
	+	    (unsigned long long)final_read,
	+	    (unsigned long long)final_bytes);
	+
	+	/*
	+	 * Stop and join both workers and release the fixture BEFORE running
	+	 * assertions: a failing KTEST_* assertion returns from this function,
	+	 * which would otherwise leave threads running on a dead stack frame.
	+	 */
	+	concurrent_test_teardown(&test_data, provider, true, true);
	+
	+	/* Verify progress was made */
	+	KTEST_VERIFY(final_written > initial_written);
	+	KTEST_VERIFY(final_bytes > initial_bytes);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates mid-read buffer swap scenario.
	+ * Tests the case where a buffer drains during a read operation and triggers
	+ * a swap, ensuring the swap happens correctly and ordering is maintained.
	+ *
	+ * Returns 0 on success and EINVAL when the fixture cannot be set up; every
	+ * setup failure path releases what was created before it.
	+ */
	+KTEST_FUNC(subscriber_mid_read_swap)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	uint32_t test_id = 0x3000;
	+	uint32_t test_data[10];
	+	char *read_buf;
	+	size_t read_buf_size = 64 * 1024;
	+	ssize_t read_bytes;
	+	struct eventlog_stats stats;
	+	int i;
	+
	+	KTEST_LOG(ctx, "Testing mid-read buffer swap scenario");
	+
	+	/* malloc to avoid stack overflow in ktest taskqueue (small stack) */
	+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+	KTEST_NEQUAL(read_buf, NULL);
	+
	+	/*
	+	 * Setup failures are handled explicitly rather than through KTEST_*
	+	 * assertions so that read_buf and earlier objects are not leaked.
	+	 */
	+	provider = test_create_provider("test_midswap", NULL, NULL);
	+	if (provider == NULL) {
	+		free(read_buf, M_EVENTLOG_TEST);
	+		return (EINVAL);
	+	}
	+
	+	subscriber = eventlog_subscriber_create_device(128 * 1024);
	+	if (subscriber == NULL) {
	+		free(read_buf, M_EVENTLOG_TEST);
	+		eventlog_provider_destroy(provider);
	+		return (EINVAL);
	+	}
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, "test_midswap",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
	+		free(read_buf, M_EVENTLOG_TEST);
	+		eventlog_subscriber_destroy(subscriber);
	+		eventlog_provider_destroy(provider);
	+		return (EINVAL);
	+	}
	+
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	if (session == NULL) {
	+		free(read_buf, M_EVENTLOG_TEST);
	+		eventlog_subscriber_destroy(subscriber);
	+		eventlog_provider_destroy(provider);
	+		return (EINVAL);
	+	}
	+
	+	for (i = 0; i < 10; i++)
	+		test_data[i] = i;
	+
	+	/* Fill buffer with events */
	+	for (i = 0; i < 100; i++) {
	+		eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, test_data, sizeof(test_data));
	+	}
	+
	+	/* Read a small chunk - this should trigger swap */
	+	eventlog_subscriber_get_stats(subscriber, &stats);
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf, 1024, 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+	KTEST_LOG(ctx, "First read: %zd bytes, dropped %llu", read_bytes,
	+	    (unsigned long long)stats.dropped_events);
	+
	+	/* Write more events while reader buffer is being drained */
	+	for (i = 100; i < 200; i++) {
	+		eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, test_data, sizeof(test_data));
	+	}
	+
	+	/*
	+	 * Continue reading - this should drain the reader buffer and trigger
	+	 * swap.
	+	 */
	+	eventlog_subscriber_get_stats(subscriber, &stats);
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    read_buf_size, 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+	KTEST_LOG(ctx, "Second read: %zd bytes, dropped %llu", read_bytes,
	+	    (unsigned long long)stats.dropped_events);
	+
	+	/* Write more events after swap */
	+	for (i = 200; i < 250; i++) {
	+		eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, test_data, sizeof(test_data));
	+	}
	+
	+	/* Read remaining events - should get events from swapped buffer */
	+	eventlog_subscriber_get_stats(subscriber, &stats);
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    read_buf_size, 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+	KTEST_LOG(ctx, "Third read: %zd bytes, dropped %llu", read_bytes,
	+	    (unsigned long long)stats.dropped_events);
	+
	+	/* Verify buffer is empty */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    read_buf_size, 0);
	+	KTEST_EQUAL(read_bytes, 0);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	eventlog_provider_destroy(provider);
	+	free(read_buf, M_EVENTLOG_TEST);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Shared state for the buffer boundary stress test.  The structure lives on
	+ * the test function's stack, so both worker threads must have signalled
	+ * their exit before the test function is allowed to return.
	+ */
	+struct boundary_test_data {
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	volatile int done;	/* Set by the test to stop both workers */
	+	int writer_exited; /* Protected by atomics; used as wait channel */
	+	int reader_exited; /* Protected by atomics; used as wait channel */
	+	volatile uint64_t events_written;
	+	struct mtx lock;
	+};
	+
	+/*
	+ * Writer thread that mixes small events with an occasional large one until
	+ * data->done is set, then publishes writer_exited.
	+ */
	+static void
	+boundary_writer_thread(void *arg)
	+{
	+	struct boundary_test_data *data = (struct boundary_test_data *)arg;
	+	uint32_t test_id = 0x4000;
	+	uint32_t small_data[1] = {0xdeadbeef};
	+	uint32_t large_data[100];
	+	int i;
	+
	+	for (i = 0; i < 100; i++)
	+		large_data[i] = i;
	+
	+	while (data->done == 0) {
	+		/* Write small events to fill buffer precisely */
	+		eventlog_event_write(data->session, test_id++,
	+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, small_data,
	+		    sizeof(small_data));
	+		atomic_add_64(&data->events_written, 1);
	+
	+		/* Occasionally write larger events to test boundaries */
	+		if ((test_id % 10) == 0) {
	+			eventlog_event_write(data->session, test_id++,
	+			    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, large_data,
	+			    sizeof(large_data));
	+			atomic_add_64(&data->events_written, 1);
	+		}
	+
	+		kern_yield(PRI_UNCHANGED);
	+	}
	+
	+	/* Last access to *data; afterwards only its address is used. */
	+	atomic_store_rel_int(&data->writer_exited, 1);
	+	wakeup(&data->writer_exited);
	+	kthread_exit();
	+}
	+
	+/*
	+ * Reader thread that rapidly reads and triggers swaps until data->done is
	+ * set, then publishes reader_exited.
	+ */
	+static void
	+boundary_reader_thread(void *arg)
	+{
	+	struct boundary_test_data *data = (struct boundary_test_data *)arg;
	+	char read_buf[8 * 1024];
	+	size_t read_bytes;
	+
	+	while (data->done == 0) {
	+		/* Read small chunks to trigger frequent swaps */
	+		read_bytes = eventlog_read_into_buf(data->subscriber, read_buf,
	+		    512, 0);
	+		if (read_bytes > 0) {
	+			/* Immediately read again to trigger swap */
	+			read_bytes = eventlog_read_into_buf(data->subscriber,
	+			    read_buf, sizeof(read_buf), 0);
	+		}
	+		kern_yield(PRI_UNCHANGED);
	+	}
	+
	+	/* Last access to *data; afterwards only its address is used. */
	+	atomic_store_rel_int(&data->reader_exited, 1);
	+	wakeup(&data->reader_exited);
	+	kthread_exit();
	+}
	+
	+/*
	+ * Stops whichever worker threads were started, waits until each one has
	+ * signalled its exit, drains the subscriber and releases every object the
	+ * test owns.  This must complete before the test function returns, because
	+ * the workers dereference the stack-allocated test data and the session.
	+ */
	+static void
	+boundary_test_teardown(struct boundary_test_data *data,
	+    struct eventlog_provider *provider, bool writer_started,
	+    bool reader_started)
	+{
	+	char drain_buf[8 * 1024];
	+	size_t drain_read;
	+
	+	/* Stop threads - wake reader if blocked. */
	+	data->done = 1;
	+	wakeup(data->subscriber);
	+	while (reader_started &&
	+	    atomic_load_acq_int(&data->reader_exited) == 0)
	+		tsleep(&data->reader_exited, 0, "evtlog_rdwait", hz / 10);
	+	while (writer_started &&
	+	    atomic_load_acq_int(&data->writer_exited) == 0)
	+		tsleep(&data->writer_exited, 0, "evtlog_wrwait", hz / 10);
	+
	+	/* Drain remaining events (workers have exited, single reader). */
	+	do {
	+		drain_read = eventlog_read_into_buf(data->subscriber,
	+		    drain_buf, sizeof(drain_buf), 0);
	+	} while (drain_read > 0);
	+
	+	mtx_destroy(&data->lock);
	+	eventlog_session_destroy(data->session);
	+	eventlog_subscriber_destroy(data->subscriber);
	+	eventlog_provider_destroy(provider);
	+}
	+
	+/*
	+ * Stress test for buffer boundary conditions.
	+ * Tests rapid writes and reads that fill buffers exactly to boundaries.
	+ *
	+ * The progress assertion runs only after both workers have been stopped
	+ * and the fixture torn down, so a failing assertion can no longer return
	+ * while a thread is still using the stack-allocated test data.
	+ */
	+KTEST_FUNC(subscriber_buffer_boundary_stress)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct boundary_test_data test_data;
	+	struct thread *writer_thread, *reader_thread;
	+	int error;
	+	uint64_t initial_written;
	+
	+	KTEST_LOG(ctx, "Testing buffer boundary stress conditions");
	+
	+	provider = test_create_provider("test_bbstress", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	/* Use 128KB buffer to trigger boundary conditions (above 64KB min) */
	+	subscriber = eventlog_subscriber_create_device(128 * 1024);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, "test_bbstress",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
	+		eventlog_subscriber_destroy(subscriber);
	+		eventlog_provider_destroy(provider);
	+		return (EINVAL);
	+	}
	+
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	bzero(&test_data, sizeof(test_data));
	+	test_data.session = session;
	+	test_data.subscriber = subscriber;
	+	test_data.done = 0;
	+	test_data.writer_exited = 0;
	+	test_data.reader_exited = 0;
	+	test_data.events_written = 0;
	+	mtx_init(&test_data.lock, "boundary_test", NULL, MTX_DEF);
	+
	+	/* Create writer thread */
	+	error = kthread_add(boundary_writer_thread, &test_data, NULL,
	+	    &writer_thread, 0, 0, "evtlog_boundary_writer");
	+	if (error != 0) {
	+		/* Nothing is running yet; just release the fixture. */
	+		KTEST_LOG(ctx, "kthread_add(evtlog_boundary_writer) failed: %d",
	+		    error);
	+		boundary_test_teardown(&test_data, provider, false, false);
	+		return (error);
	+	}
	+	KTEST_EQUAL(error, 0);
	+
	+	/* Create reader thread */
	+	error = kthread_add(boundary_reader_thread, &test_data, NULL,
	+	    &reader_thread, 0, 0, "evtlog_boundary_reader");
	+	if (error != 0) {
	+		/* The writer is already running and must be stopped first. */
	+		KTEST_LOG(ctx, "kthread_add(evtlog_boundary_reader) failed: %d",
	+		    error);
	+		boundary_test_teardown(&test_data, provider, true, false);
	+		return (error);
	+	}
	+	KTEST_EQUAL(error, 0);
	+
	+	/* Run for a period to exercise boundary conditions */
	+	tsleep(&test_data, 0, "boundary_run", hz * 2); /* 2 seconds */
	+
	+	initial_written = atomic_load_acq_64(&test_data.events_written);
	+	KTEST_LOG(ctx, "Wrote %llu events during boundary stress test",
	+	    (unsigned long long)initial_written);
	+
	+	/*
	+	 * Stop and join both workers and release the fixture BEFORE running
	+	 * the assertion: a failing KTEST_* assertion returns from this
	+	 * function, which would otherwise leave threads running on a dead
	+	 * stack frame.
	+	 */
	+	boundary_test_teardown(&test_data, provider, true, true);
	+
	+	KTEST_VERIFY(initial_written > 0);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Stress test that fills buffers exactly to capacity.
	+ * Tests edge cases where write_pos approaches buffer_size.
	+ *
	+ * Sequence: fill the first buffer, overflow it by one event (expected to
	+ * swap without a drop), fill the second buffer, overflow it by one event
	+ * (expected to be counted as exactly one drop), then read twice.
	+ *
	+ * Returns 0 on success and EINVAL when the fixture cannot be set up.
	+ */
	+KTEST_FUNC(subscriber_buffer_fill_to_capacity)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ uint32_t test_id = 0x5000;
	+ uint32_t test_data = 0x12345678;
	+ char *read_buf;
	+ size_t read_buf_size = 64 * 1024;
	+ ssize_t read_bytes;
	+ struct eventlog_stats stats;
	+ size_t buffer_size_per_cpu = 128 * 1024;
	+ size_t create_event_size;
	+ size_t event_size;
	+ size_t max_events;
	+ size_t fill_count;
	+ int i;
	+
	+ KTEST_LOG(ctx, "Testing buffer fill to exact capacity");
	+
	+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+ KTEST_NEQUAL(read_buf, NULL);
	+
	+ /* Fixture setup: each failure path releases what was created so far. */
	+ provider = test_create_provider("test_bfill", NULL, NULL);
	+ if (provider == NULL) {
	+ free(read_buf, M_EVENTLOG_TEST);
	+ return (EINVAL);
	+ }
	+
	+ subscriber = eventlog_subscriber_create_device(buffer_size_per_cpu);
	+ if (subscriber == NULL) {
	+ free(read_buf, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+ return (EINVAL);
	+ }
	+
	+ if (eventlog_subscriber_add_subscription(subscriber, "test_bfill",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0) {
	+ free(read_buf, M_EVENTLOG_TEST);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ return (EINVAL);
	+ }
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ if (session == NULL) {
	+ free(read_buf, M_EVENTLOG_TEST);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ return (EINVAL);
	+ }
	+
	+ /*
	+ * SESSION_CREATE is header-only (no payload), user events carry a
	+ * uint32_t payload. Compute how many user events fit after the
	+ * session_create event, leaving less than one event of slack.
	+ */
	+ create_event_size = sizeof(struct eventlog_event_header);
	+ event_size = sizeof(struct eventlog_event_header) + sizeof(uint32_t);
	+ max_events = (buffer_size_per_cpu - create_event_size) / event_size;
	+
	+ KTEST_LOG(ctx, "Event size: %zu bytes, create size: %zu bytes, "
	+ "buffer size: %zu bytes, max user events: %zu",
	+ event_size, create_event_size, buffer_size_per_cpu, max_events);
	+
	+ /*
	+ * Fill buffer: session_create already wrote 1 event, then max_events
	+ * user events to leave less than event_size bytes of slack.
	+ */
	+ for (i = 0; i < (int)max_events; i++) {
	+ eventlog_event_write(session, test_id + i, EVENTLOG_LEVEL_INFO,
	+ 0xFFFFFFFF, &test_data, sizeof(test_data));
	+ }
	+
	+ /* Write one more - triggers proactive swap (SWAP_ALLOWED) */
	+ eventlog_event_write(session, test_id + max_events,
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &test_data, sizeof(test_data));
	+
	+ /* The overflowing event must have been kept, not dropped. */
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+ KTEST_EQUAL(stats.dropped_events, 0);
	+ KTEST_LOG(ctx,
	+ "After first overflow (proactive swap): dropped %llu events",
	+ (unsigned long long)stats.dropped_events);
	+
	+ /*
	+ * Now fill the second buffer completely (1 event already there from
	+ * overflow). The second buffer has no session_create, so it holds
	+ * buffer_size_per_cpu / event_size user events in total. One of them
	+ * is already there, so write one fewer than that to fill it. The id
	+ * offset of 10000 only keeps these event ids apart from the first
	+ * batch.
	+ */
	+ fill_count = buffer_size_per_cpu / event_size - 1;
	+ for (i = 0; i < (int)fill_count; i++) {
	+ eventlog_event_write(session, test_id + 10000 + i,
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &test_data,
	+ sizeof(test_data));
	+ }
	+
	+ /* Write one more - SWAP_ALLOWED cleared (reader idle), so dropped */
	+ eventlog_event_write(session, test_id + 20000, EVENTLOG_LEVEL_INFO,
	+ 0xFFFFFFFF, &test_data, sizeof(test_data));
	+
	+ /* Exactly that one event is expected in the drop counter. */
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+ KTEST_EQUAL(stats.dropped_events, 1);
	+ KTEST_LOG(ctx, "After second overflow (no swap): dropped %llu events",
	+ (unsigned long long)stats.dropped_events);
	+
	+ /* Read all events from the reader buffer (filled by proactive swap) */
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ KTEST_VERIFY(read_bytes > 0);
	+
	+ /* Read again to get events from the second buffer */
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ KTEST_VERIFY(read_bytes > 0);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ free(read_buf, M_EVENTLOG_TEST);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Stress test with rapid buffer swaps.
	+ *
	+ * Repeats a fixed cycle - write a batch, read a small chunk, write a second
	+ * batch, read with the full buffer - and finally drains the subscriber.  No
	+ * result is asserted beyond completing the sequence.
	+ *
	+ * Returns 0 on success and EINVAL when the fixture cannot be set up.
	+ */
	+KTEST_FUNC(subscriber_rapid_swap_stress)
	+{
	+	enum {
	+		SWAP_CYCLES = 50,	/* write/read rounds */
	+		FIRST_BATCH = 30,	/* events before the small read */
	+		SECOND_BATCH = 20	/* events before the large read */
	+	};
	+	const size_t read_buf_size = 64 * 1024;
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_session *session;
	+	uint32_t next_id = 0x6000;
	+	uint32_t payload[10];
	+	ssize_t read_bytes;
	+	char *read_buf;
	+	int cycle, n;
	+
	+	KTEST_LOG(ctx, "Testing rapid buffer swap stress");
	+
	+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+	KTEST_NEQUAL(read_buf, NULL);
	+
	+	/* Build the fixture; unwind whatever exists if a step fails. */
	+	provider = test_create_provider("test_rswap", NULL, NULL);
	+	if (provider == NULL)
	+		goto fail_free;
	+
	+	subscriber = eventlog_subscriber_create_device(128 * 1024);
	+	if (subscriber == NULL)
	+		goto fail_provider;
	+
	+	if (eventlog_subscriber_add_subscription(subscriber, "test_rswap",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0) != 0)
	+		goto fail_subscriber;
	+
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	if (session == NULL)
	+		goto fail_subscriber;
	+
	+	for (n = 0; n < 10; n++)
	+		payload[n] = n;
	+
	+	/* Rapid cycle: write, read partially, write more, read again */
	+	for (cycle = 0; cycle < SWAP_CYCLES; cycle++) {
	+		/* First batch of this round. */
	+		for (n = 0; n < FIRST_BATCH; n++) {
	+			eventlog_event_write(session, next_id++,
	+			    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, payload,
	+			    sizeof(payload));
	+		}
	+
	+		/* A small read is enough to trigger the swap. */
	+		(void)eventlog_read_into_buf(subscriber, read_buf, 1024, 0);
	+
	+		/* Keep writing while the reader buffer is being drained. */
	+		for (n = 0; n < SECOND_BATCH; n++) {
	+			eventlog_event_write(session, next_id++,
	+			    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, payload,
	+			    sizeof(payload));
	+		}
	+
	+		/* Large read to drain the reader buffer. */
	+		(void)eventlog_read_into_buf(subscriber, read_buf,
	+		    read_buf_size, 0);
	+	}
	+
	+	/* Final drain: keep reading until nothing comes back. */
	+	do {
	+		read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+		    read_buf_size, 0);
	+	} while (read_bytes > 0);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	eventlog_provider_destroy(provider);
	+	free(read_buf, M_EVENTLOG_TEST);
	+
	+	return (0);
	+
	+fail_subscriber:
	+	eventlog_subscriber_destroy(subscriber);
	+fail_provider:
	+	eventlog_provider_destroy(provider);
	+fail_free:
	+	free(read_buf, M_EVENTLOG_TEST);
	+	return (EINVAL);
	+}
	+
	+/*
	+ * Validates device subscriber buffer size validation.
	+ * Buffer size must be between EVENTLOG_BUFFER_SIZE_MIN and
	+ * EVENTLOG_BUFFER_SIZE_MAX inclusive.
	+ *
	+ * Sizes outside that range (including 0) are expected to make
	+ * eventlog_subscriber_create_device() return NULL; both boundary values
	+ * are expected to yield a subscriber, which is destroyed again right away.
	+ */
	+KTEST_FUNC(subscriber_create_device_invalid_size)
	+{
	+ struct eventlog_subscriber *subscriber;
	+
	+ KTEST_LOG(ctx, "Testing device subscriber buffer size validation");
	+
	+ /* Too small: one byte below the minimum */
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_BUFFER_SIZE_MIN - 1);
	+ KTEST_EQUAL(subscriber, NULL);
	+
	+ /* Too small: zero */
	+ subscriber = eventlog_subscriber_create_device(0);
	+ KTEST_EQUAL(subscriber, NULL);
	+
	+ /* Too large: one byte above the maximum */
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_BUFFER_SIZE_MAX + 1);
	+ KTEST_EQUAL(subscriber, NULL);
	+
	+ /* Valid boundaries: exactly the minimum, then exactly the maximum */
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_BUFFER_SIZE_MIN);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ eventlog_subscriber_destroy(subscriber);
	+
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_BUFFER_SIZE_MAX);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ eventlog_subscriber_destroy(subscriber);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Validates that adding a subscription for a non-existent provider returns
	+ * ENOENT.
	+ *
	+ * No provider is created by this test; the subscription names a provider
	+ * that is assumed not to be registered anywhere else.
	+ */
	+KTEST_FUNC(subscriber_add_subscription_nonexistent_provider)
	+{
	+ struct eventlog_subscriber *subscriber;
	+ int error;
	+
	+ KTEST_LOG(ctx, "Testing subscription to non-existent provider");
	+
	+ /* A default-sized device subscriber is enough for this check. */
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ /* Subscribe by a name nobody registered; expect ENOENT. */
	+ error = eventlog_subscriber_add_subscription(subscriber,
	+ "nonexistent_provider_xyz", EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, ENOENT);
	+
	+ eventlog_subscriber_destroy(subscriber);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Validates eventlog_subscriber_read error paths: EOPNOTSUPP and EAGAIN.
	+ *
	+ * Three requests are issued against an empty device subscriber:
	+ *  - two iovecs         -> EOPNOTSUPP
	+ *  - one iovec, resid 0 -> EOPNOTSUPP
	+ *  - FNONBLOCK, no data -> EAGAIN
	+ *
	+ * A single scratch buffer backs all three requests.  The results are
	+ * collected first and asserted only after the buffer and the subscriber
	+ * have been released, so a failing assertion cannot leak them.
	+ */
	+KTEST_FUNC(subscriber_read_error_paths)
	+{
	+	struct eventlog_subscriber *subscriber;
	+	struct uio uio;
	+	struct iovec iov[2];
	+	char *buf;
	+	int multi_iov_error, zero_resid_error, no_data_error;
	+
	+	KTEST_LOG(ctx, "Testing subscriber read error paths");
	+
	+	subscriber = eventlog_subscriber_create_device(
	+	    EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	buf = malloc(1024, M_EVENTLOG_TEST, M_WAITOK);
	+
	+	/* EOPNOTSUPP: multiple iovecs not supported */
	+	iov[0].iov_base = buf;
	+	iov[0].iov_len = 512;
	+	iov[1].iov_base = buf + 512;
	+	iov[1].iov_len = 512;
	+	uio.uio_iov = iov;
	+	uio.uio_iovcnt = 2;
	+	uio.uio_offset = 0;
	+	uio.uio_resid = 1024;
	+	uio.uio_segflg = UIO_SYSSPACE;
	+	uio.uio_rw = UIO_READ;
	+	uio.uio_td = curthread;
	+
	+	multi_iov_error = eventlog_subscriber_read(subscriber, &uio, 0);
	+
	+	/* EOPNOTSUPP: zero resid */
	+	iov[0].iov_base = buf;
	+	iov[0].iov_len = 1024;
	+	uio.uio_iov = iov;
	+	uio.uio_iovcnt = 1;
	+	uio.uio_resid = 0;
	+
	+	zero_resid_error = eventlog_subscriber_read(subscriber, &uio, 0);
	+
	+	/* EAGAIN: FNONBLOCK with no data */
	+	iov[0].iov_base = buf;
	+	iov[0].iov_len = 1024;
	+	uio.uio_iov = iov;
	+	uio.uio_iovcnt = 1;
	+	uio.uio_resid = 1024;
	+
	+	no_data_error = eventlog_subscriber_read(subscriber, &uio, FNONBLOCK);
	+
	+	/* Release everything before asserting (assertions may return). */
	+	free(buf, M_EVENTLOG_TEST);
	+	eventlog_subscriber_destroy(subscriber);
	+
	+	KTEST_EQUAL(multi_iov_error, EOPNOTSUPP);
	+	KTEST_EQUAL(zero_resid_error, EOPNOTSUPP);
	+	KTEST_EQUAL(no_data_error, EAGAIN);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Validates that *_destroy with NULL pointer returns without crashing.
	+ *
	+ * There is nothing to assert: the test passes by reaching the return
	+ * statement after calling each destroy function with NULL.
	+ */
	+KTEST_FUNC(null_pointer_destroy)
	+{
	+ KTEST_LOG(ctx, "Testing NULL pointer handling in destroy functions");
	+
	+ /* Provider, session and subscriber destructors, each given NULL. */
	+ eventlog_provider_destroy(NULL);
	+ eventlog_session_destroy(NULL);
	+ eventlog_subscriber_destroy(NULL);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Validates that events are filtered by level and keywords.
	+ * Subscriber at INFO/0x1 should not receive VERBOSE events or events with
	+ * non-matching keywords.
	+ *
	+ * After each write the callback's event counter is sampled while holding
	+ * callback_data->lock and compared only after the lock is released.
	+ */
	+KTEST_FUNC(subscriber_level_keyword_filtering)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *callback_data;
	+ uint32_t test_id = 0x1111;
	+ uint32_t test_data = 0xdeadbeef;
	+
	+ KTEST_LOG(ctx, "Testing level and keyword filtering");
	+
	+ provider = test_create_provider("test_filter", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ /* Subscriber wants INFO level, keyword 0x1 only */
	+ subscriber = test_enable_provider_callback("test_filter",
	+ EVENTLOG_LEVEL_INFO, 0x1, &callback_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /* Event at INFO level with keyword 0x1 - should be received */
	+ eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0x1,
	+ &test_data, sizeof(test_data));
	+ {
	+ uint32_t ec;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ mtx_unlock(&callback_data->lock);
	+ KTEST_EQUAL(ec, 1);
	+ }
	+
	+ /* Event at VERBOSE level - filtered out (VERBOSE > INFO) */
	+ eventlog_event_write(session, test_id + 1, EVENTLOG_LEVEL_VERBOSE, 0x1,
	+ &test_data, sizeof(test_data));
	+ {
	+ uint32_t ec;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ mtx_unlock(&callback_data->lock);
	+ /* Count unchanged: the VERBOSE event was not delivered */
	+ KTEST_EQUAL(ec, 1);
	+ }
	+
	+ /* Event at INFO with keyword 0x2 only - filtered out (no key match) */
	+ eventlog_event_write(session, test_id + 2, EVENTLOG_LEVEL_INFO, 0x2,
	+ &test_data, sizeof(test_data));
	+ {
	+ uint32_t ec;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ mtx_unlock(&callback_data->lock);
	+ /* Count unchanged: keyword 0x2 does not match 0x1 */
	+ KTEST_EQUAL(ec, 1);
	+ }
	+
	+ /* Event at INFO with keywords 0x1 | 0x2 - received (0x1 matches) */
	+ eventlog_event_write(session, test_id + 3, EVENTLOG_LEVEL_INFO, 0x3,
	+ &test_data, sizeof(test_data));
	+ {
	+ uint32_t ec;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ mtx_unlock(&callback_data->lock);
	+ KTEST_EQUAL(ec, 2);
	+ }
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&callback_data->lock);
	+ free(callback_data, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Validates that events exceeding UINT16_MAX are dropped silently.
	+ *
	+ * A callback subscriber is attached before the session is created, so the
	+ * only event it is expected to count is SESSION_CREATE; the oversized
	+ * event written afterwards must not reach the callback.
	+ */
	+KTEST_FUNC(event_oversized_dropped)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *callback_data;
	+ uint8_t *large_payload;
	+ size_t oversized_len;
	+ uint32_t test_id = 0x9999;
	+
	+ KTEST_LOG(ctx, "Testing that oversized events are dropped");
	+
	+ /*
	+ * total_size = sizeof(eventlog_event_header) + payload.
	+ * Need total_size > UINT16_MAX (65535). Header is 32 bytes,
	+ * so payload must be > 65535 - 32 = 65503. Use 65504.
	+ */
	+ oversized_len = 65504;
	+
	+ provider = test_create_provider("test_oversize", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ subscriber = test_enable_provider_callback("test_oversize",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ large_payload = malloc(oversized_len, M_EVENTLOG_TEST,
	+ M_WAITOK | M_ZERO);
	+ KTEST_NEQUAL(large_payload, NULL);
	+
	+ /* This event exceeds UINT16_MAX and should be dropped (no callback) */
	+ eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ large_payload, oversized_len);
	+
	+ /*
	+ * Read without holding lock: the callback is never invoked for the
	+ * oversized event (it is dropped before reaching subscribers).
	+ * Holding the lock across KTEST_EQUAL can panic when kyua runs tests
	+ * in taskqueue context (KTEST_LOG may sleep).
	+ */
	+ /* SESSION_CREATE only; oversized dropped */
	+ KTEST_EQUAL(atomic_load_acq_32(&callback_data->event_count), 1);
	+
	+ free(large_payload, M_EVENTLOG_TEST);
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&callback_data->lock);
	+ free(callback_data, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Validates zero-length payload and empty session_id.
	+ *
	+ * Writes one event with a valid payload pointer but a length of 0 and
	+ * checks that the callback saw it with the expected id and a payload
	+ * size of 0.
	+ */
	+KTEST_FUNC(event_edge_cases_payload_session)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *callback_data;
	+ uint32_t test_id = 0x7777;
	+
	+ KTEST_LOG(ctx, "Testing zero-length payload and empty session_id");
	+
	+ provider = test_create_provider("test_edge", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ subscriber = test_enable_provider_callback("test_edge",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, &callback_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ /*
	+ * Session created with 0 as the second argument.
	+ * NOTE(review): presumably this is a numeric session_id, so "empty
	+ * session_id" here means id 0 rather than an empty string -- confirm.
	+ */
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /* Event with zero-length payload (valid pointer, zero length) */
	+ eventlog_event_write(session, test_id, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &test_id, 0);
	+
	+ /* Verify (read then unlock; KTEST_EQUAL may sleep) */
	+ {
	+ uint32_t ec, eid;
	+ size_t plen;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ eid = atomic_load_acq_32(&callback_data->last_event_id);
	+ plen = atomic_load_acq_long(&callback_data->last_payload_size);
	+ mtx_unlock(&callback_data->lock);
	+ /* SESSION_CREATE + 1 user event (zero-length payload) */
	+ KTEST_EQUAL(ec, 2);
	+ KTEST_EQUAL(eid, test_id);
	+ KTEST_EQUAL(plen, 0);
	+ }
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&callback_data->lock);
	+ free(callback_data, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Validates subscription update in place when re-subscribing to same provider.
	+ *
	+ * The same device subscriber subscribes twice to "test_subupd" with
	+ * different level/keywords; after each call the provider's level and
	+ * keywords (read through the eventlog_provider_get_* accessors) must equal
	+ * the values of the most recent subscription.
	+ */
	+KTEST_FUNC(subscriber_subscription_update_in_place)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_subscriber *subscriber;
	+ int error;
	+
	+ KTEST_LOG(ctx, "Testing subscription update in place");
	+
	+ provider = test_create_provider("test_subupd", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ /* First subscription */
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_subupd",
	+ EVENTLOG_LEVEL_INFO, 0x1, 0);
	+ KTEST_EQUAL(error, 0);
	+ KTEST_EQUAL(eventlog_provider_get_level(provider), EVENTLOG_LEVEL_INFO);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x1);
	+
	+ /* Re-subscribe to same provider: should update in place, not add */
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_subupd",
	+ EVENTLOG_LEVEL_VERBOSE, 0x7, 0);
	+ KTEST_EQUAL(error, 0);
	+ /* Provider now reports the second subscription's level and keywords */
	+ KTEST_EQUAL(eventlog_provider_get_level(provider),
	+ EVENTLOG_LEVEL_VERBOSE);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(provider), 0x7);
	+
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Exercises the schema-generated TEST_EVENTLOG_* macros: the *_ENABLED
	+ * predicates under different subscriber level/keyword settings, and the
	+ * *_LOG_ALWAYS and *_LOG producers, counting deliveries through a callback
	+ * subscriber.
	+ */
	+KTEST_FUNC(schema_generated_macros)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *callback_data;
	+
	+ KTEST_LOG(ctx, "Testing schema-generated macros");
	+
	+ /* Create provider */
	+ provider = test_create_provider("test_schema", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+ session = eventlog_session_create(provider, 12345, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /* Test 1: Verify _ENABLED macro returns false when no subscribers */
	+ KTEST_EQUAL(TEST_EVENTLOG_SIMPLE_EVENT_ENABLED(session), 0);
	+ KTEST_EQUAL(TEST_EVENTLOG_STATUS_EVENT_ENABLED(session), 0);
	+ KTEST_EQUAL(TEST_EVENTLOG_FLAGS_EVENT_ENABLED(session), 0);
	+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 0);
	+
	+ /* Test 2: Create callback subscriber with BASIC keyword, INFO level */
	+ subscriber = test_enable_provider_callback("test_schema",
	+ EVENTLOG_LEVEL_INFO, TEST_EVENTLOG_KEYWORD_BASIC, &callback_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ /* Provider enablement is auto-updated when subscription is added */
	+
	+ /* Verify _ENABLED macros work correctly */
	+ /* INFO level, BASIC keyword */
	+ KTEST_EQUAL(TEST_EVENTLOG_SIMPLE_EVENT_ENABLED(session), 1);
	+ /* INFO level, BASIC keyword */
	+ KTEST_EQUAL(TEST_EVENTLOG_STATUS_EVENT_ENABLED(session), 1);
	+ /* VERBOSE level, ADVANCED keyword */
	+ KTEST_EQUAL(TEST_EVENTLOG_FLAGS_EVENT_ENABLED(session), 0);
	+ /* WARN level, COMPLEX keyword */
	+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 0);
	+
	+ /* Test 3: Use _LOG_ALWAYS macro (always logs regardless of enabled) */
	+ TEST_EVENTLOG_SIMPLE_EVENT_LOG_ALWAYS(session, 0x12345678);
	+ TEST_EVENTLOG_STATUS_EVENT_LOG_ALWAYS(session, 0xABCDEF00,
	+ TEST_EVENTLOG_TEST_STATUS_RUNNING);
	+
	+ /*
	+ * Verify events were received via callback (read then unlock;
	+ * KTEST_EQUAL may sleep)
	+ */
	+ {
	+ uint32_t ec;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ mtx_unlock(&callback_data->lock);
	+ /* Session created before subscriber; 2 LOG_ALWAYS events */
	+ KTEST_EQUAL(ec, 2);
	+ }
	+
	+ /* Reset callback data for next test */
	+ mtx_lock(&callback_data->lock);
	+ callback_data->event_count = 0;
	+ mtx_unlock(&callback_data->lock);
	+
	+ /* Test 4: Use _LOG macro (should check enablement first) */
	+ TEST_EVENTLOG_SIMPLE_EVENT_LOG(session, 0x87654321);
	+ TEST_EVENTLOG_STATUS_EVENT_LOG(session, 0xFEDCBA00,
	+ TEST_EVENTLOG_TEST_STATUS_SUCCESS);
	+ TEST_EVENTLOG_FLAGS_EVENT_LOG(session, 0x11111111,
	+ TEST_EVENTLOG_FLAG_FLAG_A | TEST_EVENTLOG_FLAG_FLAG_B);
	+ TEST_EVENTLOG_COMPLEX_EVENT_LOG(session, 0x22222222, 0x33333333,
	+ TEST_EVENTLOG_TEST_STATUS_RUNNING, TEST_EVENTLOG_FLAG_FLAG_C, -42);
	+
	+ /*
	+ * Verify only enabled events were received (read then unlock;
	+ * KTEST_EQUAL may sleep)
	+ */
	+ {
	+ uint32_t ec;
	+ mtx_lock(&callback_data->lock);
	+ ec = atomic_load_acq_32(&callback_data->event_count);
	+ mtx_unlock(&callback_data->lock);
	+ /* Only SIMPLE and STATUS (session existed before subscriber) */
	+ KTEST_EQUAL(ec, 2);
	+ }
	+
	+ /*
	+ * Test 5: Replace the subscriber with one at VERBOSE level and the
	+ * ADVANCED keyword (the old subscriber and its callback data are
	+ * destroyed first).
	+ */
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&callback_data->lock);
	+ free(callback_data, M_EVENTLOG_TEST);
	+ subscriber = test_enable_provider_callback("test_schema",
	+ EVENTLOG_LEVEL_VERBOSE, TEST_EVENTLOG_KEYWORD_ADVANCED,
	+ &callback_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ /* Provider enablement is auto-updated when subscription is added */
	+
	+ /* Verify FLAGS_EVENT is now enabled */
	+ KTEST_EQUAL(TEST_EVENTLOG_FLAGS_EVENT_ENABLED(session), 1);
	+ /* Still WARN level */
	+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 0);
	+
	+ /*
	+ * Test 6: Replace the subscriber again, this time at WARN level with
	+ * the COMPLEX keyword.
	+ */
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&callback_data->lock);
	+ free(callback_data, M_EVENTLOG_TEST);
	+ subscriber = test_enable_provider_callback("test_schema",
	+ EVENTLOG_LEVEL_WARN, TEST_EVENTLOG_KEYWORD_COMPLEX,
	+ &callback_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ /* Provider enablement is auto-updated when subscription is added */
	+
	+ /* Verify COMPLEX_EVENT is now enabled */
	+ KTEST_EQUAL(TEST_EVENTLOG_COMPLEX_EVENT_ENABLED(session), 1);
	+
	+ /* Cleanup */
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&callback_data->lock);
	+ free(callback_data, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+ return (0);
	+}
	+
	+/*
	+ * Exercise the varlen trailing-array codegen for VARLEN_EVENT { id,
	+ * count, values:uint64_t[count:8] }: producer macro with partial,
	+ * clamped, and zero counts; accessor returning the trailing array.
	+ */
	+
	+/*
	+ * State shared between schema_varlen_event and varlen_event_callback.
	+ * The three counters are bumped with atomic_add_32; the last_* fields are
	+ * plain stores made by the callback.
	+ */
	+struct varlen_cb_data {
	+ /* Initialized and destroyed by the test; the callback never takes it */
	+ struct mtx lock;
	+ /* Number of callback invocations */
	+ uint32_t events;
	+ uint32_t matched; /* events whose payload parsed correctly */
	+ /* events whose tail/head mismatched expectation */
	+ uint32_t mismatch;
	+ /* payload_size of the most recent event that passed the size check */
	+ size_t last_payload_size;
	+ /* count field of that same event */
	+ uint8_t last_count;
	+ /* First and last trailing values of the last event with count > 0 */
	+ uint64_t last_first_value;
	+ uint64_t last_last_value;
	+};
	+
	+/*
	+ * Callback for schema_varlen_event: gathers the event's iov segments into
	+ * a contiguous stack buffer, runs the generated
	+ * test_eventlog_varlen_event_values() accessor on it and records the
	+ * outcome in the struct varlen_cb_data passed as callback_arg.
	+ *
	+ * Every invocation bumps d->events. Events whose payload_size is smaller
	+ * than the fixed head or larger than the buffer are ignored (neither
	+ * matched nor mismatch is bumped). Events whose segments do not add up to
	+ * payload_size, or whose accessor returns NULL for a non-zero count, are
	+ * counted in d->mismatch; everything else is counted in d->matched.
	+ */
	+static void
	+varlen_event_callback(const struct eventlog_event_header *hdr __unused,
	+ const char *provider_name __unused, uint8_t provider_name_len __unused,
	+ uint64_t session_id __unused,
	+ const struct iovec *iov, int iovcnt, size_t payload_size,
	+ void *callback_arg)
	+{
	+ struct varlen_cb_data *d = callback_arg;
	+ /*
	+ * The varlen producer emits a 2-segment iov: [head][tail].
	+ * Compact it into a stack buffer for the generated accessor.
	+ * Sized from the schema's declared max.
	+ */
	+ uint8_t buf[sizeof(struct test_eventlog_varlen_event) +
	+ TEST_EVENTLOG_VARLEN_EVENT_VALUES_MAX * sizeof(uint64_t)];
	+ const struct test_eventlog_varlen_event *evt;
	+ const uint64_t *vals;
	+ size_t off;
	+ int i;
	+
	+ atomic_add_32(&d->events, 1);
	+ if (payload_size < sizeof(*evt) || payload_size > sizeof(buf))
	+ return;
	+ off = 0;
	+ for (i = 0; i < iovcnt; i++) {
	+ if (iov[i].iov_len == 0)
	+ continue;
	+ /*
	+ * Bound every copy by the space left in buf instead of
	+ * trusting payload_size to describe the segments.
	+ */
	+ if (iov[i].iov_len > sizeof(buf) - off) {
	+ atomic_add_32(&d->mismatch, 1);
	+ return;
	+ }
	+ memcpy(buf + off, iov[i].iov_base, iov[i].iov_len);
	+ off += iov[i].iov_len;
	+ }
	+ /* The accessor reads payload_size bytes; all must have been copied */
	+ if (off != payload_size) {
	+ atomic_add_32(&d->mismatch, 1);
	+ return;
	+ }
	+ evt = (const struct test_eventlog_varlen_event *)buf;
	+
	+ vals = test_eventlog_varlen_event_values(evt, payload_size);
	+ d->last_payload_size = payload_size;
	+ d->last_count = evt->count;
	+ if (evt->count == 0) {
	+ /*
	+ * No trailing elements expected; accessor may still succeed
	+ * (payload_size == sizeof(head) + 0). Count as matched.
	+ */
	+ atomic_add_32(&d->matched, 1);
	+ return;
	+ }
	+ if (vals == NULL) {
	+ atomic_add_32(&d->mismatch, 1);
	+ return;
	+ }
	+ d->last_first_value = vals[0];
	+ d->last_last_value = vals[evt->count - 1];
	+ atomic_add_32(&d->matched, 1);
	+}
	+
	+/*
	+ * Logs three VARLEN_EVENT instances (count 4, count 32 and count 0 with a
	+ * NULL array) through the generated producer macro and checks that
	+ * varlen_event_callback counted all three as matched. Finally checks that
	+ * the generated accessor returns NULL when handed a payload that is too
	+ * short for the count stored in it.
	+ */
	+KTEST_FUNC(schema_varlen_event)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct varlen_cb_data cb;
	+ uint64_t payload[32];
	+ uint32_t i;
	+
	+ KTEST_LOG(ctx, "Testing varlen trailing-array schema events");
	+
	+ bzero(&cb, sizeof(cb));
	+ mtx_init(&cb.lock, "varlen_cb", NULL, MTX_DEF);
	+
	+ provider = test_create_provider("test_varlen", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+ session = eventlog_session_create(provider, 0x4711, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ subscriber = eventlog_subscriber_create_callback(varlen_event_callback,
	+ &cb);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_varlen", EVENTLOG_LEVEL_INFO, TEST_EVENTLOG_KEYWORD_BASIC, 0),
	+ 0);
	+
	+ KTEST_EQUAL(TEST_EVENTLOG_VARLEN_EVENT_ENABLED(session), 1);
	+
	+ /* Case 1: partial count (4 of 8). Accessor should return the tail. */
	+ for (i = 0; i < 4; i++)
	+ payload[i] = 0xAA00ULL + i;
	+ TEST_EVENTLOG_VARLEN_EVENT_LOG(session, 0xD00D, 4, payload);
	+
	+ /* Case 2: count > MAX. Producer macro must clamp to 8. */
	+ for (i = 0; i < 32; i++)
	+ payload[i] = 0xBB00ULL + i;
	+ TEST_EVENTLOG_VARLEN_EVENT_LOG(session, 0xBEEF, 32, payload);
	+
	+ /* Case 3: count == 0, values == NULL. No tail to copy. */
	+ TEST_EVENTLOG_VARLEN_EVENT_LOG(session, 0xCAFE, 0, NULL);
	+
	+ /*
	+ * Subscribers call us synchronously from the writer; no sleep needed.
	+ * The session was created BEFORE the subscriber attached, so no
	+ * SESSION_CREATE is delivered here -- we only see the 3 varlen
	+ * events we logged.
	+ */
	+ KTEST_EQUAL(atomic_load_acq_32(&cb.events), 3);
	+ KTEST_EQUAL(atomic_load_acq_32(&cb.matched), 3);
	+ KTEST_EQUAL(atomic_load_acq_32(&cb.mismatch), 0);
	+
	+ /*
	+ * Last event (count == 0) should have been delivered with a payload
	+ * equal to exactly sizeof(struct test_eventlog_varlen_event).
	+ */
	+ KTEST_EQUAL((int)cb.last_count, 0);
	+ KTEST_EQUAL((int)cb.last_payload_size,
	+ (int)sizeof(struct test_eventlog_varlen_event));
	+
	+ /*
	+ * Spot-check accessor robustness against a short payload: count says
	+ * 4 trailing values but the size passed covers the head only.
	+ */
	+ struct test_eventlog_varlen_event evt = { .id = 0, .count = 4 };
	+ KTEST_EQUAL(test_eventlog_varlen_event_values(&evt, sizeof(evt)),
	+ (const uint64_t *)NULL);
	+
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_session_destroy(session);
	+ eventlog_provider_destroy(provider);
	+ mtx_destroy(&cb.lock);
	+ return (0);
	+}
	+
	+/*
	+ * Exercise eventlog_event_write_gather() directly: the iov is delivered
	+ * to the callback unchanged and segments concatenate in order.
	+ */
	+
	+/*
	+ * Exercises a multi-segment iov whose compacted size exceeds any
	+ * reasonable stack buffer in the framework. The iov path has no size
	+ * ceiling short of UINT16_MAX (wire-format event_length cap).
	+ */
	+#define GATHER_BIG_PAYLOAD_SIZE 4096
	+
	+/* Counters and last-event details recorded by gather_event_callback. */
	+struct gather_cb_data {
	+ /* Number of callback invocations */
	+ uint32_t events;
	+ /* Events with an empty payload or at least one non-empty segment */
	+ uint32_t matched;
	+ /* Events with a non-zero payload_size but no non-empty segment */
	+ uint32_t mismatch;
	+ /* payload_size of the most recent event */
	+ size_t last_payload_size;
	+ /* First byte of the first non-empty segment (0 for empty payload) */
	+ uint8_t last_first_byte;
	+ /* Last byte of the last non-empty segment (0 for empty payload) */
	+ uint8_t last_last_byte;
	+};
	+
	+/*
	+ * Callback for event_write_gather: records, in the struct gather_cb_data
	+ * passed as callback_arg, the payload size of the event plus the first
	+ * byte of its first non-empty iov segment and the last byte of its last
	+ * non-empty segment. The segments are inspected in place, never copied.
	+ *
	+ * Every invocation bumps d->events. An empty payload, or a payload with
	+ * at least one non-empty segment, bumps d->matched; a non-zero
	+ * payload_size with no non-empty segment bumps d->mismatch.
	+ */
	+static void
	+gather_event_callback(const struct eventlog_event_header *hdr __unused,
	+ const char *provider_name __unused, uint8_t provider_name_len __unused,
	+ uint64_t session_id __unused,
	+ const struct iovec *iov, int iovcnt, size_t payload_size,
	+ void *callback_arg)
	+{
	+ struct gather_cb_data *d = callback_arg;
	+ const uint8_t *first_seg;
	+ const uint8_t *last_seg;
	+ int i;
	+
	+ atomic_add_32(&d->events, 1);
	+ d->last_payload_size = payload_size;
	+ if (payload_size == 0) {
	+ d->last_first_byte = 0;
	+ d->last_last_byte = 0;
	+ atomic_add_32(&d->matched, 1);
	+ return;
	+ }
	+ /*
	+ * Walk iov to pick out first-byte-of-first-nonempty-segment and
	+ * last-byte-of-last-nonempty-segment without compacting.
	+ */
	+ first_seg = NULL;
	+ last_seg = NULL;
	+ for (i = 0; i < iovcnt; i++) {
	+ if (iov[i].iov_len == 0)
	+ continue;
	+ if (first_seg == NULL)
	+ first_seg = iov[i].iov_base;
	+ /* Keeps advancing, so it ends on the last non-empty segment */
	+ last_seg = (const uint8_t *)iov[i].iov_base +
	+ iov[i].iov_len - 1;
	+ }
	+ if (first_seg == NULL || last_seg == NULL) {
	+ atomic_add_32(&d->mismatch, 1);
	+ return;
	+ }
	+ d->last_first_byte = first_seg[0];
	+ d->last_last_byte = *last_seg;
	+ atomic_add_32(&d->matched, 1);
	+}
	+
	+/*
	+ * Writes four events through eventlog_event_write_gather() with 0, 1, 2
	+ * and 3 iov segments and checks that gather_event_callback counted all of
	+ * them as matched, that the subscriber reports no dropped events, and
	+ * that the size and boundary bytes recorded for the last (3-segment)
	+ * event are the expected ones.
	+ */
	+KTEST_FUNC(event_write_gather)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct gather_cb_data cb;
	+ struct iovec iov[3];
	+ uint8_t seg0[8], seg1[16];
	+ uint8_t *big;
	+ size_t i;
	+
	+ KTEST_LOG(ctx, "Testing eventlog_event_write_gather() scatter/gather");
	+
	+ bzero(&cb, sizeof(cb));
	+ provider = test_create_provider("test_gather", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+ session = eventlog_session_create(provider, 0x4712, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+ subscriber = eventlog_subscriber_create_callback(gather_event_callback,
	+ &cb);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_gather", EVENTLOG_LEVEL_INFO, TEST_EVENTLOG_KEYWORD_BASIC, 0),
	+ 0);
	+ eventlog_session_set_enabled(session, 1);
	+
	+ /* Case 1: iovcnt == 0, empty payload. */
	+ eventlog_event_write_gather(session, 0x100, EVENTLOG_LEVEL_INFO,
	+ TEST_EVENTLOG_KEYWORD_BASIC, NULL, 0);
	+
	+ /*
	+ * Case 2: iovcnt == 1, contiguous buffer. gather_event_callback
	+ * reads the bytes in place through iov[0].iov_base; it makes no
	+ * copy.
	+ */
	+ for (i = 0; i < sizeof(seg0); i++)
	+ seg0[i] = (uint8_t)(0x10 + i);
	+ iov[0].iov_base = seg0;
	+ iov[0].iov_len = sizeof(seg0);
	+ eventlog_event_write_gather(session, 0x101, EVENTLOG_LEVEL_INFO,
	+ TEST_EVENTLOG_KEYWORD_BASIC, iov, 1);
	+
	+ /*
	+ * Case 3: iovcnt == 2, small payload: seg0 followed by seg1. Only
	+ * the event counters are checked for this case; the byte checks at
	+ * the end apply to the last event written (Case 4).
	+ */
	+ for (i = 0; i < sizeof(seg1); i++)
	+ seg1[i] = (uint8_t)(0xA0 + i);
	+ iov[0].iov_base = seg0;
	+ iov[0].iov_len = sizeof(seg0);
	+ iov[1].iov_base = seg1;
	+ iov[1].iov_len = sizeof(seg1);
	+ eventlog_event_write_gather(session, 0x102, EVENTLOG_LEVEL_INFO,
	+ TEST_EVENTLOG_KEYWORD_BASIC, iov, 2);
	+
	+ /*
	+ * Case 4: iovcnt == 3, large multi-segment payload. The framework
	+ * passes the iov through unchanged; the callback sees three
	+ * segments and reports the first byte of seg0 and the last byte
	+ * of seg1. Nothing is dropped and no allocation happens.
	+ */
	+ big = malloc(GATHER_BIG_PAYLOAD_SIZE, M_EVENTLOG_TEST, M_WAITOK);
	+ for (i = 0; i < GATHER_BIG_PAYLOAD_SIZE; i++)
	+ big[i] = (uint8_t)(i & 0xFF);
	+ iov[0].iov_base = seg0;
	+ iov[0].iov_len = sizeof(seg0); /* bytes 0x10..0x17 */
	+ iov[1].iov_base = big;
	+ iov[1].iov_len = GATHER_BIG_PAYLOAD_SIZE; /* 0x00..0xFF... */
	+ iov[2].iov_base = seg1;
	+ iov[2].iov_len = sizeof(seg1); /* bytes 0xA0..0xAF */
	+ eventlog_event_write_gather(session, 0x103, EVENTLOG_LEVEL_INFO,
	+ TEST_EVENTLOG_KEYWORD_BASIC, iov, 3);
	+
	+ /* Four events, all matched (no mismatch, nothing dropped). */
	+ KTEST_EQUAL(atomic_load_acq_32(&cb.events), 4);
	+ KTEST_EQUAL(atomic_load_acq_32(&cb.matched), 4);
	+ KTEST_EQUAL(atomic_load_acq_32(&cb.mismatch), 0);
	+
	+ {
	+ struct eventlog_stats stats;
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+ KTEST_EQUAL(stats.dropped_events, 0);
	+ }
	+
	+ /* Last event: first byte from seg0, last byte from seg1's end. */
	+ KTEST_EQUAL((int)cb.last_payload_size,
	+ (int)(sizeof(seg0) + GATHER_BIG_PAYLOAD_SIZE + sizeof(seg1)));
	+ KTEST_EQUAL((int)cb.last_first_byte, 0x10);
	+ KTEST_EQUAL((int)cb.last_last_byte,
	+ (int)(uint8_t)(0xA0 + sizeof(seg1) - 1));
	+
	+ free(big, M_EVENTLOG_TEST);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_session_destroy(session);
	+ eventlog_provider_destroy(provider);
	+ return (0);
	+}
	+
	+/* ===== Lock-free per-CPU buffer tests ===== */
	+
	+/* Thread data for multi-writer tests */
	+struct lockfree_writer_data {
	+ /* Session every event of this thread is written to */
	+ struct eventlog_session *session;
	+ /* Barrier: all threads wait until set */
	+ int *go;
	+ /* Set to 1 by the thread right before it exits */
	+ int done;
	+ /* Stored in the payload and shifted left by 16 into the event id */
	+ uint32_t thread_idx;
	+ /* Number of events the thread writes */
	+ uint32_t num_events;
	+ /* Incremented by the thread after each eventlog_event_write() call */
	+ uint32_t events_written;
	+};
	+
	+/*
	+ * Kernel thread body for the multi-writer tests. arg points to a
	+ * struct lockfree_writer_data owned by the test that started the thread.
	+ *
	+ * Waits until *data->go becomes non-zero, writes data->num_events events
	+ * to data->session (payload is { thread_idx, sequence number }), then
	+ * sets data->done, wakes any sleeper on it and exits the thread.
	+ */
	+static void
	+lockfree_writer_thread(void *arg)
	+{
	+ struct lockfree_writer_data *data = arg;
	+ uint32_t event_data[2];
	+ uint32_t i;
	+
	+ /*
	+ * Sleep until all threads are ready. Using tsleep instead of
	+ * cpu_spinwait avoids deadlocking on systems with fewer CPUs
	+ * than writer threads (busy-spinning writers would monopolize
	+ * all CPUs, preventing the main thread from setting go).
	+ */
	+ while (atomic_load_acq_32((volatile uint32_t *)data->go) == 0)
	+ tsleep(data->go, 0, "lf_go", 1);
	+
	+ for (i = 0; i < data->num_events; i++) {
	+ event_data[0] = data->thread_idx;
	+ event_data[1] = i;
	+ /* Event id: thread index in the high half, sequence in the low */
	+ eventlog_event_write(data->session,
	+ (data->thread_idx << 16) | i,
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ event_data, sizeof(event_data));
	+ data->events_written++;
	+ }
	+
	+ /* Release store so events_written is visible once done reads as 1 */
	+ atomic_store_rel_32((volatile uint32_t *)&data->done, 1);
	+ wakeup(&data->done);
	+ kthread_exit();
	+}
	+
	+/*
	+ * Stress the lock-free write/commit path by having many writers
	+ * concurrently write to the same device subscriber. All writers start
	+ * simultaneously to maximize contention on per-CPU buffers.
	+ *
	+ * The only hard assertions are that every writer reports having issued
	+ * all of its writes and that at least some bytes could be read back;
	+ * the attempted/dropped totals are logged for inspection.
	+ */
	+KTEST_FUNC(lockfree_many_concurrent_writers)
	+{
	+#define LF_NUM_WRITERS 8
	+#define LF_EVENTS_PER_WRITER 500
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct lockfree_writer_data writers[LF_NUM_WRITERS];
	+ struct thread *threads[LF_NUM_WRITERS];
	+ int go = 0;
	+ struct eventlog_stats stats;
	+ char *read_buf;
	+ size_t read_buf_size = 256 * 1024;
	+ size_t total_read = 0;
	+ size_t read_bytes;
	+ int i, error;
	+
	+ KTEST_LOG(ctx,
	+ "Testing lock-free concurrent writers (%d threads, %d events each)",
	+ LF_NUM_WRITERS, LF_EVENTS_PER_WRITER);
	+
	+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+ KTEST_NEQUAL(read_buf, NULL);
	+
	+ provider = test_create_provider("test_lf_many", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ subscriber = eventlog_subscriber_create_device(256 * 1024);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_lf_many",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /*
	+ * Create all writer threads (they poll go with short tsleep calls).
	+ * NOTE(review): writers[] and go live on this stack frame; if
	+ * KTEST_EQUAL returns from the test on failure, already-started
	+ * threads would keep using them -- confirm.
	+ */
	+ for (i = 0; i < LF_NUM_WRITERS; i++) {
	+ bzero(&writers[i], sizeof(writers[i]));
	+ writers[i].session = session;
	+ writers[i].go = &go;
	+ writers[i].thread_idx = i;
	+ writers[i].num_events = LF_EVENTS_PER_WRITER;
	+ error = kthread_add(lockfree_writer_thread, &writers[i], NULL,
	+ &threads[i], 0, 0, "lf_writer_%d", i);
	+ KTEST_EQUAL(error, 0);
	+ }
	+
	+ /* Release all writers simultaneously */
	+ atomic_store_rel_32((volatile uint32_t *)&go, 1);
	+ wakeup(&go);
	+
	+ /* Wait for all writers to finish */
	+ for (i = 0; i < LF_NUM_WRITERS; i++) {
	+ while (atomic_load_acq_32(
	+ (volatile uint32_t *)&writers[i].done) == 0)
	+ tsleep(&writers[i].done, 0, "lf_wait", hz / 10);
	+ KTEST_EQUAL(writers[i].events_written, LF_EVENTS_PER_WRITER);
	+ }
	+
	+ /* Read until a read returns no bytes, accumulating the byte total */
	+ do {
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ total_read += read_bytes;
	+ } while (read_bytes > 0);
	+
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+ KTEST_LOG(ctx, "Total bytes read: %zu, dropped events: %llu",
	+ total_read, (unsigned long long)stats.dropped_events);
	+ KTEST_VERIFY(total_read > 0);
	+
	+ /*
	+ * With a 256KB buffer, some events may be dropped on small CPUs where
	+ * all threads hit the same per-CPU buffer. That's fine - the test
	+ * validates no crashes and no corruption (INVARIANTS checks). The
	+ * attempted and dropped totals are only logged below; the relation
	+ * written + dropped == total attempted is not asserted here.
	+ */
	+ /* +1 for SESSION_CREATE */
	+ uint64_t total_attempted =
	+ (uint64_t)LF_NUM_WRITERS * LF_EVENTS_PER_WRITER + 1;
	+ KTEST_LOG(ctx, "Total attempted: %llu, dropped: %llu",
	+ (unsigned long long)total_attempted,
	+ (unsigned long long)stats.dropped_events);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ free(read_buf, M_EVENTLOG_TEST);
	+
	+ return (0);
	+#undef LF_NUM_WRITERS
	+#undef LF_EVENTS_PER_WRITER
	+}
	+
	+/*
	+ * Stress the writer/swap contention path: many writers + a reader doing
	+ * rapid swaps. This exercises the commit CAS retry path when a reader
	+ * swap races with a writer's commit.
	+ */
	+struct lockfree_swap_writer_data {
	+ /* Session every event of this writer is written to */
	+ struct eventlog_session *session;
	+ /* Shared flag; the writer loops until it reads as non-zero */
	+ int *stop;
	+ /* Bumped with atomic_add_64 after each eventlog_event_write() call */
	+ uint64_t events_written;
	+ /* Set to 1 by the writer right before it exits */
	+ int exited;
	+};
	+
	+/*
	+ * Kernel thread body for the writer/swap contention test. arg points to
	+ * a struct lockfree_swap_writer_data owned by the test.
	+ *
	+ * Until the shared stop flag reads as non-zero, writes one event per
	+ * iteration (payload is a running 32-bit sequence number, event id is
	+ * 0x1000 plus its low byte), counts it and yields the CPU. On the way
	+ * out it sets the exited flag, wakes any sleeper on it and exits.
	+ */
	+static void
	+lockfree_swap_writer(void *arg)
	+{
	+ struct lockfree_swap_writer_data *w;
	+ volatile uint32_t *stop_flag;
	+ uint32_t seq;
	+
	+ w = arg;
	+ stop_flag = (volatile uint32_t *)w->stop;
	+
	+ for (seq = 0; atomic_load_acq_32(stop_flag) == 0; seq++) {
	+ eventlog_event_write(w->session, 0x1000 + (seq & 0xFF),
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &seq, sizeof(seq));
	+ atomic_add_64(&w->events_written, 1);
	+ /* Give other threads a chance to run between writes */
	+ kern_yield(PRI_UNCHANGED);
	+ }
	+
	+ atomic_store_rel_32((volatile uint32_t *)&w->exited, 1);
	+ wakeup(&w->exited);
	+ kthread_exit();
	+}
	+
	+/*
	+ * Callout handler: arg is the address of the shared stop flag. Stores 1
	+ * into it with release semantics and wakes threads sleeping on that
	+ * address.
	+ */
	+static void
	+lockfree_stop_callout(void *arg)
	+{
	+ volatile uint32_t *flag = (volatile uint32_t *)arg;
	+
	+ atomic_store_rel_32(flag, 1);
	+ wakeup(arg);
	+}
	+
	+/*
	+ * Runs LFSW_NUM_WRITERS lockfree_swap_writer threads against one device
	+ * subscriber while this thread reads from it in a loop. A callout sets
	+ * the shared stop flag after LFSW_RUN_SECONDS. The test then waits for
	+ * the writers to exit, drains what is left and verifies that events were
	+ * written, bytes were read and at least one read returned data.
	+ */
	+KTEST_FUNC(lockfree_writer_swap_contention)
	+{
	+#define LFSW_NUM_WRITERS 4
	+#define LFSW_RUN_SECONDS 3
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct lockfree_swap_writer_data writers[LFSW_NUM_WRITERS];
	+ struct thread *threads[LFSW_NUM_WRITERS];
	+ int stop = 0;
	+ struct callout stop_timer;
	+ char *read_buf;
	+ size_t read_buf_size = 64 * 1024;
	+ size_t total_bytes_read = 0;
	+ size_t read_bytes;
	+ uint64_t swap_iterations = 0;
	+ struct eventlog_stats stats;
	+ int i, error;
	+
	+ KTEST_LOG(ctx,
	+ "Testing lock-free writer/swap contention (%d writers, %d seconds)",
	+ LFSW_NUM_WRITERS, LFSW_RUN_SECONDS);
	+
	+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+ KTEST_NEQUAL(read_buf, NULL);
	+
	+ provider = test_create_provider("test_lf_swap", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ /* 128KB buffer to trigger frequent swaps (above 64KB minimum) */
	+ subscriber = eventlog_subscriber_create_device(128 * 1024);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_lf_swap",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /*
	+ * Start writer threads.
	+ * NOTE(review): writers[] and stop live on this stack frame; if
	+ * KTEST_EQUAL returns from the test on failure, already-started
	+ * threads would keep using them -- confirm.
	+ */
	+ for (i = 0; i < LFSW_NUM_WRITERS; i++) {
	+ bzero(&writers[i], sizeof(writers[i]));
	+ writers[i].session = session;
	+ writers[i].stop = &stop;
	+ error = kthread_add(lockfree_swap_writer, &writers[i], NULL,
	+ &threads[i], 0, 0, "lfsw_writer_%d", i);
	+ KTEST_EQUAL(error, 0);
	+ }
	+
	+ /*
	+ * Use a callout to set stop from softclock context. On a 2-CPU system,
	+ * writers in tight loops can starve the main thread on the run queue,
	+ * preventing it from ever executing stop=1. The callout fires from
	+ * timer interrupt context, bypassing scheduler contention.
	+ */
	+ callout_init(&stop_timer, 1);
	+ callout_reset(&stop_timer, hz * LFSW_RUN_SECONDS,
	+ lockfree_stop_callout, &stop);
	+
	+ /* Reader loop: read rapidly to trigger swaps while writers active */
	+ while (atomic_load_acq_32((volatile uint32_t *)&stop) == 0) {
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ /* swap_iterations counts only reads that returned data */
	+ if (read_bytes > 0) {
	+ total_bytes_read += read_bytes;
	+ swap_iterations++;
	+ }
	+ tsleep(&stop, 0, "lfsw_rd", 1);
	+ }
	+
	+ /* Make sure the callout is no longer running before stop goes away */
	+ callout_drain(&stop_timer);
	+
	+ /* Wait for writers to exit */
	+ for (i = 0; i < LFSW_NUM_WRITERS; i++) {
	+ while (atomic_load_acq_32(
	+ (volatile uint32_t *)&writers[i].exited) == 0)
	+ tsleep(&writers[i].exited, 0, "lfsw_wait", hz / 10);
	+ }
	+
	+ /* Drain remaining */
	+ do {
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ total_bytes_read += read_bytes;
	+ } while (read_bytes > 0);
	+
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+
	+ /* All writers have exited, so the per-writer counters are final */
	+ uint64_t total_written = 0;
	+ for (i = 0; i < LFSW_NUM_WRITERS; i++)
	+ total_written += writers[i].events_written;
	+
	+ KTEST_LOG(ctx, "Writers produced %llu events, reader did %llu swaps, "
	+ "read %zu bytes, dropped %llu",
	+ (unsigned long long)total_written,
	+ (unsigned long long)swap_iterations,
	+ total_bytes_read,
	+ (unsigned long long)stats.dropped_events);
	+
	+ KTEST_VERIFY(total_written > 0);
	+ KTEST_VERIFY(total_bytes_read > 0);
	+ KTEST_VERIFY(swap_iterations > 0);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ free(read_buf, M_EVENTLOG_TEST);
	+
	+ return (0);
	+#undef LFSW_NUM_WRITERS
	+#undef LFSW_RUN_SECONDS
	+}
	+
	+/*
	+ * Test buffer-full contention: a small (128KB) subscriber buffer plus
	+ * LFBF_NUM_WRITERS concurrent writers, with nothing draining the buffer
	+ * while the writers run, to force the buffer-full swap/drop path under
	+ * contention.
	+ *
	+ * The only explicit assertion on the outcome is that at least one event
	+ * was dropped. Corruption is not checked by this function itself; per the
	+ * comment near the end it is left to the INVARIANTS validation described
	+ * as being part of eventlog_read.
	+ */
	+KTEST_FUNC(lockfree_buffer_full_contention)
	+{
	+#define LFBF_NUM_WRITERS 4
	+#define LFBF_EVENTS_PER_WRITER 5000
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct lockfree_writer_data writers[LFBF_NUM_WRITERS];
	+ struct thread *threads[LFBF_NUM_WRITERS];
	+ /* Start gate shared with every writer; set to 1 once all are created */
	+ int go = 0;
	+ struct eventlog_stats stats;
	+ char *read_buf;
	+ size_t read_buf_size = 64 * 1024;
	+ size_t total_read = 0;
	+ size_t read_bytes;
	+ int i, error;
	+
	+ KTEST_LOG(ctx, "Testing lock-free buffer full contention (%d writers, "
	+ "%d events each, 128KB buffer)",
	+ LFBF_NUM_WRITERS, LFBF_EVENTS_PER_WRITER);
	+
	+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+ KTEST_NEQUAL(read_buf, NULL);
	+
	+ provider = test_create_provider("test_lf_bfull", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ /* 128KB buffer - will overflow quickly with concurrent writers */
	+ subscriber = eventlog_subscriber_create_device(128 * 1024);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber,
	+ "test_lf_bfull", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /* Drain SESSION_CREATE */
	+ /* The byte count of this read is intentionally not checked */
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+
	+ /* Create all writer threads */
	+ for (i = 0; i < LFBF_NUM_WRITERS; i++) {
	+ bzero(&writers[i], sizeof(writers[i]));
	+ writers[i].session = session;
	+ writers[i].go = &go;
	+ writers[i].thread_idx = i;
	+ writers[i].num_events = LFBF_EVENTS_PER_WRITER;
	+ error = kthread_add(lockfree_writer_thread, &writers[i], NULL,
	+ &threads[i], 0, 0, "lfbf_writer_%d", i);
	+ KTEST_EQUAL(error, 0);
	+ }
	+
	+ /* Release all writers */
	+ atomic_store_rel_32((volatile uint32_t *)&go, 1);
	+ wakeup(&go);
	+
	+ /* Wait for completion */
	+ /*
	+ * Each wait re-checks the done flag after at most hz / 10 ticks, so a
	+ * missed wakeup only delays the loop instead of hanging it.
	+ * NOTE(review): writers[] lives on this stack frame; this relies on
	+ * lockfree_writer_thread not touching its argument after setting
	+ * done -- confirm against that function.
	+ */
	+ for (i = 0; i < LFBF_NUM_WRITERS; i++) {
	+ while (atomic_load_acq_32(
	+ (volatile uint32_t *)&writers[i].done) == 0)
	+ tsleep(&writers[i].done, 0, "lfbf_wait", hz / 10);
	+ }
	+
	+ /* Read whatever survived */
	+ do {
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ total_read += read_bytes;
	+ } while (read_bytes > 0);
	+
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+
	+ /* Widen before multiplying so the product is computed in 64 bits */
	+ uint64_t total_attempted =
	+ (uint64_t)LFBF_NUM_WRITERS * LFBF_EVENTS_PER_WRITER;
	+ KTEST_LOG(ctx, "Attempted %llu events, dropped %llu, read %zu bytes",
	+ (unsigned long long)total_attempted,
	+ (unsigned long long)stats.dropped_events,
	+ total_read);
	+
	+ /*
	+ * With a 128KB buffer and no reader draining during writes, almost all
	+ * events should be dropped. The key assertion is that we didn't crash
	+ * and INVARIANTS didn't fire.
	+ */
	+ KTEST_VERIFY(stats.dropped_events > 0);
	+
	+ /*
	+ * Validate that events that were read are well-formed by reading with
	+ * INVARIANTS buffer validation (already baked into eventlog_read).
	+ */
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ free(read_buf, M_EVENTLOG_TEST);
	+
	+ return (0);
	+#undef LFBF_NUM_WRITERS
	+#undef LFBF_EVENTS_PER_WRITER
	+}
	+
	+/*
	+ * Test data integrity under concurrent lock-free writes: many writers +
	+ * concurrent reader, verify every event read back has valid structure
	+ * (an event_length that is at least one header and fits in the bytes
	+ * read). This catches torn writes or commit ordering bugs.
	+ *
	+ * The structure below is the argument block for the reader kthread
	+ * (lockfree_integrity_reader): the test fills in the inputs, the reader
	+ * updates the counters, and the test inspects them after the reader has
	+ * set exited.
	+ */
	+struct lockfree_integrity_reader_data {
	+ /* Subscriber the reader pulls event data from */
	+ struct eventlog_subscriber *subscriber;
	+ /* Points at the test's stop flag; the reader loops while it is 0 */
	+ int *stop;
	+ /* Number of events whose header passed the length checks */
	+ uint64_t events_validated;
	+ /* Total bytes returned by the reader's non-empty reads */
	+ uint64_t bytes_read;
	+ /* Number of reads abandoned because an event_length was invalid */
	+ uint64_t corrupt_events;
	+ /* Set to 1 by the reader just before it exits */
	+ int exited;
	+};
	+
	+/*
	+ * Reader kthread body for the data-integrity test.
	+ *
	+ * arg is a struct lockfree_integrity_reader_data. Until *data->stop
	+ * becomes non-zero the thread repeatedly reads from data->subscriber
	+ * into a private 64KB buffer and walks the returned bytes as a sequence
	+ * of events, using each header's event_length to find the next one.
	+ *
	+ * An event is counted in events_validated when its event_length is at
	+ * least sizeof(struct eventlog_event_header) and the event lies entirely
	+ * within the bytes returned. The first event that fails either check
	+ * bumps corrupt_events and ends the walk of that read; trailing bytes
	+ * too short to hold a header are ignored.
	+ *
	+ * On exit the thread frees its buffer, publishes exited = 1, wakes any
	+ * sleeper on &data->exited and terminates via kthread_exit().
	+ */
	+static void
	+lockfree_integrity_reader(void *arg)
	+{
	+	struct lockfree_integrity_reader_data *data = arg;
	+	struct eventlog_event_header hdr;
	+	size_t read_buf_size = 64 * 1024;
	+	size_t offset, read_bytes;
	+	char *read_buf;
	+
	+	read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+
	+	while (atomic_load_acq_32((volatile uint32_t *)data->stop) == 0) {
	+		read_bytes = eventlog_read_into_buf(data->subscriber,
	+		    read_buf, read_buf_size, 0);
	+		if (read_bytes == 0) {
	+			/* Nothing available: let other threads run. */
	+			kern_yield(PRI_UNCHANGED);
	+			continue;
	+		}
	+
	+		data->bytes_read += read_bytes;
	+
	+		/* Walk each event and validate structure */
	+		offset = 0;
	+		while (offset + sizeof(struct eventlog_event_header) <=
	+		    read_bytes) {
	+			/* Copy out: the header may be unaligned in read_buf. */
	+			memcpy(&hdr, read_buf + offset, sizeof(hdr));
	+
	+			if (hdr.event_length <
	+			    sizeof(struct eventlog_event_header) ||
	+			    offset + hdr.event_length > read_bytes) {
	+				/*
	+				 * The length cannot be trusted, so the rest
	+				 * of this read cannot be parsed either.
	+				 */
	+				data->corrupt_events++;
	+				break;
	+			}
	+
	+			data->events_validated++;
	+			offset += hdr.event_length;
	+		}
	+	}
	+
	+	free(read_buf, M_EVENTLOG_TEST);
	+	atomic_store_rel_32((volatile uint32_t *)&data->exited, 1);
	+	wakeup(&data->exited);
	+	kthread_exit();
	+}
	+
	+/*
	+ * Data-integrity test driver: starts one lockfree_integrity_reader
	+ * kthread and LFDI_NUM_WRITERS lockfree_swap_writer kthreads against a
	+ * single session, lets them run until a callout armed for
	+ * LFDI_RUN_SECONDS raises the shared stop flag, waits for every thread
	+ * to report exited, and then requires that events were written, that
	+ * the reader validated at least one event, and that it saw no corrupt
	+ * events.
	+ */
	+KTEST_FUNC(lockfree_data_integrity_under_contention)
	+{
	+#define LFDI_NUM_WRITERS 4
	+#define LFDI_RUN_SECONDS 3
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct lockfree_swap_writer_data writers[LFDI_NUM_WRITERS];
	+ struct lockfree_integrity_reader_data reader_data;
	+ struct thread *writer_threads[LFDI_NUM_WRITERS];
	+ struct thread *reader_thread;
	+ /* Shared stop flag: polled here and handed to reader and writers */
	+ int stop = 0;
	+ struct callout stop_timer;
	+ struct eventlog_stats stats;
	+ int i, error;
	+
	+ KTEST_LOG(ctx,
	+ "Testing lock-free data integrity (%d writers + reader, "
	+ "%d seconds)",
	+ LFDI_NUM_WRITERS, LFDI_RUN_SECONDS);
	+
	+ provider = test_create_provider("test_lf_integ", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ /* 128KB buffer: holds some events but small enough to swap often */
	+ subscriber = eventlog_subscriber_create_device(128 * 1024);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber,
	+ "test_lf_integ", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ /* Start reader */
	+ /* Zeroing also clears the reader's counters and its exited flag */
	+ bzero(&reader_data, sizeof(reader_data));
	+ reader_data.subscriber = subscriber;
	+ reader_data.stop = &stop;
	+ error = kthread_add(lockfree_integrity_reader, &reader_data, NULL,
	+ &reader_thread, 0, 0, "lfdi_reader");
	+ KTEST_EQUAL(error, 0);
	+
	+ /* Start writers */
	+ for (i = 0; i < LFDI_NUM_WRITERS; i++) {
	+ bzero(&writers[i], sizeof(writers[i]));
	+ writers[i].session = session;
	+ writers[i].stop = &stop;
	+ error = kthread_add(lockfree_swap_writer, &writers[i], NULL,
	+ &writer_threads[i], 0, 0, "lfdi_writer_%d", i);
	+ KTEST_EQUAL(error, 0);
	+ }
	+
	+ /*
	+ * Use a callout to set stop from softclock context. On a 2-CPU system,
	+ * writers in tight loops can starve the main thread on the run queue,
	+ * preventing it from ever executing stop=1. The callout fires from
	+ * timer interrupt context, bypassing scheduler contention.
	+ */
	+ callout_init(&stop_timer, 1);
	+ callout_reset(&stop_timer, hz * LFDI_RUN_SECONDS,
	+ lockfree_stop_callout, &stop);
	+
	+ /* Re-check stop at least once per second (hz ticks) */
	+ while (atomic_load_acq_32((volatile uint32_t *)&stop) == 0)
	+ tsleep(&stop, 0, "lfdi_run", hz);
	+
	+ /* stop_timer is on this stack frame; make sure it is no longer running */
	+ callout_drain(&stop_timer);
	+ wakeup(subscriber); /* Wake reader if sleeping */
	+
	+ /*
	+ * writers[] and reader_data are on this stack frame, so every thread
	+ * must have reported exited before the function may return.
	+ */
	+ for (i = 0; i < LFDI_NUM_WRITERS; i++) {
	+ while (atomic_load_acq_32(
	+ (volatile uint32_t *)&writers[i].exited) == 0)
	+ tsleep(&writers[i].exited, 0, "lfdi_ww", hz / 10);
	+ }
	+ while (atomic_load_acq_32(
	+ (volatile uint32_t *)&reader_data.exited) == 0)
	+ tsleep(&reader_data.exited, 0, "lfdi_rw", hz / 10);
	+
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+
	+ uint64_t total_written = 0;
	+ for (i = 0; i < LFDI_NUM_WRITERS; i++)
	+ total_written += writers[i].events_written;
	+
	+ KTEST_LOG(ctx, "Writers: %llu events. Reader: validated %llu events, "
	+ "%llu bytes, %llu corrupt. Dropped: %llu",
	+ (unsigned long long)total_written,
	+ (unsigned long long)reader_data.events_validated,
	+ (unsigned long long)reader_data.bytes_read,
	+ (unsigned long long)reader_data.corrupt_events,
	+ (unsigned long long)stats.dropped_events);
	+
	+ KTEST_VERIFY(total_written > 0);
	+ KTEST_VERIFY(reader_data.events_validated > 0);
	+ KTEST_EQUAL(reader_data.corrupt_events, 0);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+#undef LFDI_NUM_WRITERS
	+#undef LFDI_RUN_SECONDS
	+}
	+
	+/*
	+ * Test reader-writer swap race: reader aggressively swaps buffers while
	+ * writers are mid-write. With a small (128KB) buffer, the reader swaps
	+ * frequently, maximizing the chance the reader's swap CAS races with a
	+ * writer's commit CAS. The writer must detect the swap (active buffer
	+ * changed) and redo the write to the correct buffer. Validates no panics
	+ * (MPASS), and that the writers produced events and the reader was able
	+ * to read data while they ran.
	+ *
	+ * Unlike the data-integrity test, the reader here is the test thread
	+ * itself, spinning on non-blocking reads into a 4KB buffer.
	+ */
	+KTEST_FUNC(lockfree_reader_writer_swap_race)
	+{
	+#define LFRW_NUM_WRITERS 4
	+#define LFRW_RUN_SECONDS 3
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct lockfree_swap_writer_data writers[LFRW_NUM_WRITERS];
	+ struct thread *threads[LFRW_NUM_WRITERS];
	+ /* Shared stop flag: polled by the reader loop, handed to the writers */
	+ int stop = 0;
	+ struct callout stop_timer;
	+ char *read_buf;
	+ size_t read_buf_size = 4096;
	+ size_t total_bytes_read = 0;
	+ size_t read_bytes;
	+ /* Counts only the reads that returned data */
	+ uint64_t read_iterations = 0;
	+ struct eventlog_stats stats;
	+ int i, error;
	+
	+ KTEST_LOG(ctx,
	+ "Testing reader-writer swap race (%d writers, %d seconds, "
	+ "128KB buffer)", LFRW_NUM_WRITERS, LFRW_RUN_SECONDS);
	+
	+ read_buf = malloc(read_buf_size, M_EVENTLOG_TEST, M_WAITOK);
	+ KTEST_NEQUAL(read_buf, NULL);
	+
	+ provider = test_create_provider("test_lf_race", NULL, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ /* 128KB buffer: forces frequent swaps, maximizing race window */
	+ subscriber = eventlog_subscriber_create_device(128 * 1024);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_lf_race",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ session = eventlog_session_create(provider, 0, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ for (i = 0; i < LFRW_NUM_WRITERS; i++) {
	+ bzero(&writers[i], sizeof(writers[i]));
	+ writers[i].session = session;
	+ writers[i].stop = &stop;
	+ error = kthread_add(lockfree_swap_writer, &writers[i], NULL,
	+ &threads[i], 0, 0, "lfrw_writer_%d", i);
	+ KTEST_EQUAL(error, 0);
	+ }
	+
	+ KTEST_LOG(ctx, "checkpoint: writers started, arming stop callout");
	+
	+ /* The callout is what ends the reader loop below */
	+ callout_init(&stop_timer, 1);
	+ callout_reset(&stop_timer, hz * LFRW_RUN_SECONDS,
	+ lockfree_stop_callout, &stop);
	+
	+ KTEST_LOG(ctx, "checkpoint: entering reader loop");
	+
	+ /*
	+ * Reader loop: read as fast as possible (no tsleep) to maximize
	+ * the chance of swapping while a writer is mid-commit.
	+ */
	+ while (atomic_load_acq_32((volatile uint32_t *)&stop) == 0) {
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, FNONBLOCK);
	+ if (read_bytes > 0) {
	+ total_bytes_read += read_bytes;
	+ read_iterations++;
	+ }
	+ }
	+
	+ KTEST_LOG(ctx,
	+ "checkpoint: reader loop exited (iters=%llu, bytes=%zu); "
	+ "draining callout",
	+ (unsigned long long)read_iterations, total_bytes_read);
	+
	+ callout_drain(&stop_timer);
	+
	+ KTEST_LOG(ctx, "checkpoint: callout drained, waiting for writers");
	+
	+ /* writers[] is on this stack frame; wait until each reports exited */
	+ for (i = 0; i < LFRW_NUM_WRITERS; i++) {
	+ while (atomic_load_acq_32(
	+ (volatile uint32_t *)&writers[i].exited) == 0)
	+ tsleep(&writers[i].exited, 0, "lfrw_ww", hz / 10);
	+ KTEST_LOG(ctx, "checkpoint: writer %d exited", i);
	+ }
	+
	+ KTEST_LOG(ctx, "checkpoint: all writers exited, draining buffers");
	+
	+ /* Drain remaining */
	+ /* These reads add to total_bytes_read but not to read_iterations */
	+ do {
	+ read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+ read_buf_size, 0);
	+ total_bytes_read += read_bytes;
	+ } while (read_bytes > 0);
	+
	+ KTEST_LOG(ctx, "checkpoint: drain complete, gathering stats");
	+
	+ eventlog_subscriber_get_stats(subscriber, &stats);
	+
	+ uint64_t total_written = 0;
	+ for (i = 0; i < LFRW_NUM_WRITERS; i++)
	+ total_written += writers[i].events_written;
	+
	+ KTEST_LOG(ctx, "Writers: %llu events. Reader: %llu reads, %zu bytes. "
	+ "Dropped: %llu",
	+ (unsigned long long)total_written,
	+ (unsigned long long)read_iterations,
	+ total_bytes_read,
	+ (unsigned long long)stats.dropped_events);
	+
	+ KTEST_VERIFY(total_written > 0);
	+ KTEST_VERIFY(total_bytes_read > 0);
	+ KTEST_VERIFY(read_iterations > 0);
	+
	+ KTEST_LOG(ctx, "checkpoint: tearing down session/subscriber/provider");
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(provider);
	+ free(read_buf, M_EVENTLOG_TEST);
	+
	+ KTEST_LOG(ctx, "checkpoint: teardown complete");
	+
	+ return (0);
	+#undef LFRW_NUM_WRITERS
	+#undef LFRW_RUN_SECONDS
	+}
	+
	+/*
	+ * Test: timestamp epoch boundary defers future-timestamped events.
	+ *
	+ * Writes five events with small explicit timestamps (1..5) followed by
	+ * one event stamped UINT64_MAX - 1000. A read is then expected to return
	+ * exactly the five low-timestamp events, and a second, non-blocking read
	+ * is expected to return nothing because the remaining event is deferred.
	+ *
	+ * NOTE(review): read_buf is an 8KB automatic array; confirm that this is
	+ * acceptable for the kernel stack the test runs on.
	+ */
	+KTEST_FUNC(timestamp_epoch_boundary)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_event_header hdr;
	+	char read_buf[8 * 1024];
	+	size_t offset, read_bytes;
	+	uint32_t payload;
	+	int i, event_count;
	+
	+	KTEST_LOG(ctx, "Testing timestamp epoch boundary deferral");
	+
	+	provider = test_create_provider("test_ts_epoch", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+	subscriber = test_enable_provider_device("test_ts_epoch",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
	+	KTEST_NEQUAL(subscriber, NULL);
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	/* Drain SESSION_CREATE */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    sizeof(read_buf), 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+
	+	/* Write 5 events with timestamps well in the past (1-5 microseconds) */
	+	for (i = 0; i < 5; i++) {
	+		payload = (uint32_t)(i + 1);
	+		eventlog_event_write_at(session, 100 + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, &payload, sizeof(payload),
	+		    (uint64_t)(i + 1));
	+	}
	+
	+	/* Write 1 event with a far-future timestamp */
	+	payload = 0xFFFF;
	+	eventlog_event_write_at(session, 200, EVENTLOG_LEVEL_INFO,
	+	    0xFFFFFFFF, &payload, sizeof(payload),
	+	    UINT64_MAX - 1000);
	+
	+	/* Read: should get exactly the 5 past events */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    sizeof(read_buf), 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+
	+	/*
	+	 * Count events and verify timestamps are all in the past. The walk
	+	 * stops at the first header whose event_length is smaller than a
	+	 * header or runs past the bytes read.
	+	 */
	+	event_count = 0;
	+	offset = 0;
	+	while (offset + sizeof(struct eventlog_event_header) <= read_bytes) {
	+		memcpy(&hdr, read_buf + offset, sizeof(hdr));
	+		if (hdr.event_length < sizeof(struct eventlog_event_header) ||
	+		    offset + hdr.event_length > read_bytes)
	+			break;
	+		/* The future event was stamped exactly UINT64_MAX - 1000. */
	+		KTEST_VERIFY(hdr.timestamp < UINT64_MAX - 1000);
	+		event_count++;
	+		offset += hdr.event_length;
	+	}
	+	KTEST_EQUAL(event_count, 5);
	+
	+	/* Second read (non-blocking): future deferred, nothing readable */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    sizeof(read_buf), FNONBLOCK);
	+	KTEST_EQUAL(read_bytes, 0);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Test: normal events (real timestamps) are unaffected by epoch boundary.
	+ *
	+ * Writes ten events through eventlog_event_write(), which takes no
	+ * explicit timestamp, and expects a single read to return all ten and a
	+ * following non-blocking read to return nothing.
	+ *
	+ * NOTE(review): read_buf is an 8KB automatic array; confirm that this is
	+ * acceptable for the kernel stack the test runs on.
	+ */
	+KTEST_FUNC(timestamp_epoch_normal_delivery)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_event_header hdr;
	+	char read_buf[8 * 1024];
	+	size_t offset, read_bytes;
	+	uint32_t payload;
	+	int i, event_count;
	+
	+	KTEST_LOG(ctx, "Testing that normal events pass epoch boundary");
	+
	+	provider = test_create_provider("test_ts_normal", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+	subscriber = test_enable_provider_device("test_ts_normal",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
	+	KTEST_NEQUAL(subscriber, NULL);
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	/* Drain SESSION_CREATE */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    sizeof(read_buf), 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+
	+	/* Write 10 events with real timestamps */
	+	for (i = 0; i < 10; i++) {
	+		payload = (uint32_t)(i + 1);
	+		eventlog_event_write(session, 100 + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, &payload, sizeof(payload));
	+	}
	+
	+	/* Read: should get all 10 events */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    sizeof(read_buf), 0);
	+	KTEST_VERIFY(read_bytes > 0);
	+
	+	/*
	+	 * Count the events by hopping from header to header; stop at the
	+	 * first header whose event_length is smaller than a header or runs
	+	 * past the bytes read.
	+	 */
	+	event_count = 0;
	+	offset = 0;
	+	while (offset + sizeof(struct eventlog_event_header) <= read_bytes) {
	+		memcpy(&hdr, read_buf + offset, sizeof(hdr));
	+		if (hdr.event_length < sizeof(struct eventlog_event_header) ||
	+		    offset + hdr.event_length > read_bytes)
	+			break;
	+		event_count++;
	+		offset += hdr.event_length;
	+	}
	+	KTEST_EQUAL(event_count, 10);
	+
	+	/* Buffer should be empty now */
	+	read_bytes = eventlog_read_into_buf(subscriber, read_buf,
	+	    sizeof(read_buf), FNONBLOCK);
	+	KTEST_EQUAL(read_bytes, 0);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Test: small uio buffer with epoch boundary requires multiple reads.
	+ *
	+ * Writes six events with small explicit timestamps (1000..1005) and two
	+ * stamped close to UINT64_MAX, then reads non-blockingly through a
	+ * buffer sized for roughly two events per call. Up to ten reads are
	+ * issued, stopping early on the first empty one. Every event seen must
	+ * carry a timestamp below UINT64_MAX - 10000, and exactly six events
	+ * must be seen in total, i.e. the two future events are never delivered.
	+ */
	+KTEST_FUNC(timestamp_epoch_small_uio)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct eventlog_event_header hdr;
	+	/* Two header+payload events plus 64 bytes of slack. */
	+	char small_buf[2 * (sizeof(struct eventlog_event_header) +
	+	    sizeof(uint32_t)) + 64];
	+	size_t offset, read_bytes;
	+	uint32_t payload;
	+	int i, total_events;
	+
	+	KTEST_LOG(ctx, "Testing epoch boundary with small uio buffer");
	+
	+	provider = test_create_provider("test_ts_small_uio", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+	subscriber = test_enable_provider_device("test_ts_small_uio",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF);
	+	KTEST_NEQUAL(subscriber, NULL);
	+	session = eventlog_session_create(provider, 0, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	/* Drain SESSION_CREATE with a large buffer */
	+	{
	+		/* Kept in its own scope so the 4KB array is short-lived. */
	+		char drain_buf[4096];
	+
	+		read_bytes = eventlog_read_into_buf(subscriber, drain_buf,
	+		    sizeof(drain_buf), 0);
	+		KTEST_VERIFY(read_bytes > 0);
	+	}
	+
	+	/* Write 6 events with past timestamps, then 2 with future */
	+	for (i = 0; i < 6; i++) {
	+		payload = (uint32_t)(i + 1);
	+		eventlog_event_write_at(session, 100 + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, &payload, sizeof(payload),
	+		    (uint64_t)(1000 + i));
	+	}
	+	for (i = 0; i < 2; i++) {
	+		payload = (uint32_t)(100 + i);
	+		eventlog_event_write_at(session, 200 + i, EVENTLOG_LEVEL_INFO,
	+		    0xFFFFFFFF, &payload, sizeof(payload),
	+		    UINT64_MAX - (uint64_t)(2000 - i));
	+	}
	+
	+	/* Read with a buffer that fits ~2 events at a time */
	+	total_events = 0;
	+	for (i = 0; i < 10; i++) {
	+		read_bytes = eventlog_read_into_buf(subscriber,
	+		    small_buf, sizeof(small_buf), FNONBLOCK);
	+		if (read_bytes == 0)
	+			break;
	+
	+		offset = 0;
	+		while (offset + sizeof(struct eventlog_event_header) <=
	+		    read_bytes) {
	+			memcpy(&hdr, small_buf + offset, sizeof(hdr));
	+			if (hdr.event_length <
	+			    sizeof(struct eventlog_event_header) ||
	+			    offset + hdr.event_length > read_bytes)
	+				break;
	+			/* Both future stamps lie above this threshold. */
	+			KTEST_VERIFY(hdr.timestamp < UINT64_MAX - 10000);
	+			total_events++;
	+			offset += hdr.event_length;
	+		}
	+	}
	+
	+	/* Should have read exactly the 6 past events */
	+	KTEST_EQUAL(total_events, 6);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Dump state test infrastructure.
	+ *
	+ * dump_callback_invocations counts calls to test_dump_callback.
	+ * dump_test_sessions[0 .. dump_test_session_count - 1] are the sessions
	+ * the callback may write a dump event to; tests fill these in before
	+ * subscribing and reset the slots to NULL before destroying the sessions.
	+ */
	+static volatile uint32_t dump_callback_invocations;
	+static struct eventlog_session *dump_test_sessions[4];
	+static int dump_test_session_count;
	+
	+/*
	+ * Provider dump callback shared by the dump_state tests.
	+ *
	+ * Bumps dump_callback_invocations, then writes one INFO-level event
	+ * (id 0x100 + i, payload 0xdead0000 | i) to every registered session
	+ * that is non-NULL and whose effective_level is at least
	+ * EVENTLOG_LEVEL_INFO. Neither argument is used.
	+ */
	+static void
	+test_dump_callback(struct eventlog_provider *provider __unused,
	+    void *arg __unused)
	+{
	+	uint32_t data;
	+	int i;
	+
	+	atomic_add_32(&dump_callback_invocations, 1);
	+	for (i = 0; i < dump_test_session_count; i++) {
	+		if (dump_test_sessions[i] == NULL ||
	+		    dump_test_sessions[i]->effective_level <
	+		    EVENTLOG_LEVEL_INFO)
	+			continue;
	+
	+		data = 0xdead0000 | i;
	+		eventlog_event_write(dump_test_sessions[i], 0x100 + i,
	+		    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+		    &data, sizeof(data));
	+	}
	+}
	+
	+/*
	+ * Verify dump callback is invoked and events arrive at subscriber.
	+ *
	+ * Registers one session with test_dump_callback, subscribes a callback
	+ * subscriber with EVENTLOG_SUBSCRIPTION_DUMP_STATE, drains the dump and
	+ * then requires exactly one callback invocation and at least one event
	+ * delivered to the subscriber.
	+ */
	+KTEST_FUNC(dump_state_basic)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *callback_data;
	+
	+	KTEST_LOG(ctx, "Testing dump state basic functionality");
	+
	+	dump_callback_invocations = 0;
	+	dump_test_session_count = 1;
	+
	+	provider = test_create_provider("test_ds_basic", test_dump_callback,
	+	    NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	session = eventlog_session_create(provider, 1, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+	dump_test_sessions[0] = session;
	+
	+	callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data->lock, "test_ds_basic", NULL, MTX_DEF);
	+	subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+	    "test_ds_basic", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+	    EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+	/*
	+	 * dump_state runs on a private taskqueue; drain before reading
	+	 * the observation counters so we deterministically see the
	+	 * post-dump state and not the in-flight state.
	+	 */
	+	eventlog_subscriber_drain_dumps(subscriber);
	+
	+	KTEST_EQUAL(atomic_load_acq_32(&dump_callback_invocations), 1);
	+	/*
	+	 * At least the one dump event must have arrived. A SESSION_CREATE
	+	 * event may be counted as well, hence ">=" rather than "==".
	+	 */
	+	KTEST_VERIFY(atomic_load_acq_32(&callback_data->event_count) >= 1);
	+
	+	/* Unregister the session from the callback before destroying it. */
	+	dump_test_sessions[0] = NULL;
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Verify dump events go only to the requesting subscriber, not others.
	+ *
	+ * Two callback subscribers subscribe to the same provider one after the
	+ * other, each requesting a state dump. After the second dump has been
	+ * drained, the second subscriber must have received at least one event
	+ * and the first subscriber's event count must be unchanged from the
	+ * value sampled before the second subscription.
	+ */
	+KTEST_FUNC(dump_state_routing)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *sub1, *sub2;
	+	struct test_callback_data *cd1, *cd2;
	+	uint32_t sub1_count_before;
	+
	+	KTEST_LOG(ctx, "Testing dump state routing to single subscriber");
	+
	+	dump_callback_invocations = 0;
	+	dump_test_session_count = 1;
	+
	+	provider = test_create_provider("test_ds_route", test_dump_callback,
	+	    NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	session = eventlog_session_create(provider, 1, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+	dump_test_sessions[0] = session;
	+
	+	/*
	+	 * sub1: subscribes first. Its own dump runs immediately and produces
	+	 * one event; drain so the count we capture next is stable.
	+	 */
	+	cd1 = malloc(sizeof(*cd1), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
	+	mtx_init(&cd1->lock, "test_ds_route1", NULL, MTX_DEF);
	+	sub1 = eventlog_subscriber_create_callback(test_event_callback, cd1);
	+	KTEST_NEQUAL(sub1, NULL);
	+	KTEST_EQUAL(eventlog_subscriber_add_subscription(sub1, "test_ds_route",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+	    EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+	eventlog_subscriber_drain_dumps(sub1);
	+
	+	sub1_count_before = atomic_load_acq_32(&cd1->event_count);
	+
	+	/*
	+	 * sub2 subscribes second. Its dump must be routed only to sub2 --
	+	 * sub1's count must not change.
	+	 */
	+	cd2 = malloc(sizeof(*cd2), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
	+	mtx_init(&cd2->lock, "test_ds_route2", NULL, MTX_DEF);
	+	sub2 = eventlog_subscriber_create_callback(test_event_callback, cd2);
	+	KTEST_NEQUAL(sub2, NULL);
	+	KTEST_EQUAL(eventlog_subscriber_add_subscription(sub2, "test_ds_route",
	+	    EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+	    EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+	eventlog_subscriber_drain_dumps(sub2);
	+
	+	/* sub2 should have received the dump event */
	+	KTEST_VERIFY(atomic_load_acq_32(&cd2->event_count) >= 1);
	+	/* sub1 should NOT have received any additional events from the dump */
	+	KTEST_EQUAL(atomic_load_acq_32(&cd1->event_count), sub1_count_before);
	+
	+	/* Unregister the session from the callback before destroying it. */
	+	dump_test_sessions[0] = NULL;
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(sub1);
	+	eventlog_subscriber_destroy(sub2);
	+	mtx_destroy(&cd1->lock);
	+	mtx_destroy(&cd2->lock);
	+	free(cd1, M_EVENTLOG_TEST);
	+	free(cd2, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Verify DUMP_STATE with NULL callback is a graceful no-op.
	+ *
	+ * The provider is created without a dump callback. Subscribing with
	+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE must still succeed, draining dumps
	+ * must return, and the subscriber must have seen at most one event.
	+ */
	+KTEST_FUNC(dump_state_no_callback)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *session;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *callback_data;
	+
	+	KTEST_LOG(ctx, "Testing dump state with no callback (graceful no-op)");
	+
	+	provider = test_create_provider("test_ds_nocb", NULL, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	session = eventlog_session_create(provider, 1, true, NULL, 0);
	+	KTEST_NEQUAL(session, NULL);
	+
	+	callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data->lock, "test_ds_nocb", NULL, MTX_DEF);
	+	subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	/* Should succeed without crash even though no dump callback */
	+	KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+	    "test_ds_nocb", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+	    EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+	/*
	+	 * No dump task should have been enqueued because the provider has
	+	 * no dump_callback. drain_dumps still has to be a no-op in that
	+	 * case (dump_pending stays at 0); call it explicitly to pin that
	+	 * contract.
	+	 */
	+	eventlog_subscriber_drain_dumps(subscriber);
	+
	+	/* No dump events; only SESSION_CREATE may be counted */
	+	KTEST_VERIFY(atomic_load_acq_32(&callback_data->event_count) <= 1);
	+
	+	eventlog_session_destroy(session);
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Scratch state for the curvnet regression test: the td_vnet value of
	+ * the thread that executed the dump callback, stored as a uintptr_t so
	+ * this file needs no struct-vnet definition, and a flag recording that
	+ * the callback actually ran.
	+ */
	+static volatile uintptr_t dump_observed_td_vnet;
	+static volatile bool dump_observed_set;
	+
	+/*
	+ * Dump callback that only records the calling thread's td_vnet.
	+ * Both arguments are ignored.
	+ */
	+static void
	+test_dump_callback_capture_td_vnet(struct eventlog_provider *provider __unused,
	+    void *arg __unused)
	+{
	+	struct thread *self = curthread;
	+
	+	/* Publish the value first, then the "value is valid" flag. */
	+	dump_observed_td_vnet = (uintptr_t)self->td_vnet;
	+	dump_observed_set = true;
	+}
	+
	+/*
	+ * Regression test for NCD-9675.
	+ *
	+ * Pins down the framework contract that motivated the TCP fix: the eventlog
	+ * machinery invokes provider->dump_callback without setting curvnet.
	+ * Providers that touch per-vnet state must iterate vnets / set curvnet
	+ * themselves. The dump runs on a kernel taskqueue thread whose
	+ * td_vnet is NULL; the test subscribes, drains, and verifies the
	+ * callback's observed context.
	+ *
	+ * If a future change makes the framework set curvnet around the dump
	+ * callback, this test will fail and the change should be deliberate (and
	+ * accompanied by removing the per-provider VNET_FOREACH wrappers).
	+ */
	+KTEST_FUNC(dump_state_curvnet_not_set)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *callback_data;
	+	int ret;
	+
	+	KTEST_LOG(ctx, "Verifying dump_callback runs with curvnet unset");
	+
	+	/*
	+	 * Seed with a non-zero sentinel so that a later reading of 0 can
	+	 * only come from the callback having stored it.
	+	 */
	+	dump_observed_td_vnet = (uintptr_t)0x1;
	+	dump_observed_set = false;
	+
	+	provider = test_create_provider("test_ds_curvnet",
	+	    test_dump_callback_capture_td_vnet, NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data->lock, "test_ds_curvnet", NULL, MTX_DEF);
	+	subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	ret = eventlog_subscriber_add_subscription(subscriber,
	+	    "test_ds_curvnet", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+	    EVENTLOG_SUBSCRIPTION_DUMP_STATE);
	+	KTEST_EQUAL(ret, 0);
	+
	+	/* Wait for the async dump task to finish before reading observed. */
	+	eventlog_subscriber_drain_dumps(subscriber);
	+
	+	KTEST_VERIFY(dump_observed_set);
	+	KTEST_VERIFY(dump_observed_td_vnet == 0);
	+
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Verify _ENABLED macros skip disabled sessions during dump.
	+ *
	+ * Registers two sessions with test_dump_callback, one of which is
	+ * disabled via eventlog_session_set_enabled(..., 0) before the dump is
	+ * requested. After the dump has drained, the callback must have run
	+ * exactly once and the subscriber must have received at least one event.
	+ *
	+ * NOTE(review): the ">= 1" check does not by itself prove that the
	+ * disabled session produced nothing; the received count is logged for
	+ * manual inspection.
	+ */
	+KTEST_FUNC(dump_state_disabled_sessions)
	+{
	+	struct eventlog_provider *provider;
	+	struct eventlog_session *enabled_session, *disabled_session;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *callback_data;
	+	uint32_t ec;
	+
	+	KTEST_LOG(ctx, "Testing dump state skips disabled sessions");
	+
	+	dump_callback_invocations = 0;
	+	dump_test_session_count = 2;
	+
	+	provider = test_create_provider("test_ds_dis", test_dump_callback,
	+	    NULL);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	enabled_session = eventlog_session_create(provider, 1, true, NULL, 0);
	+	KTEST_NEQUAL(enabled_session, NULL);
	+	dump_test_sessions[0] = enabled_session;
	+
	+	disabled_session = eventlog_session_create(provider, 2, true, NULL, 0);
	+	KTEST_NEQUAL(disabled_session, NULL);
	+	eventlog_session_set_enabled(disabled_session, 0);
	+	dump_test_sessions[1] = disabled_session;
	+
	+	callback_data = malloc(sizeof(*callback_data), M_EVENTLOG_TEST,
	+	    M_WAITOK | M_ZERO);
	+	mtx_init(&callback_data->lock, "test_ds_dis", NULL, MTX_DEF);
	+	subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+	    callback_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+	KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+	    "test_ds_dis", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+	    EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+	eventlog_subscriber_drain_dumps(subscriber);
	+
	+	KTEST_EQUAL(atomic_load_acq_32(&dump_callback_invocations), 1);
	+	/*
	+	 * The dump callback writes to both sessions, but the disabled session's
	+	 * effective_level is NONE so eventlog_event_write_impl's subscriber
	+	 * filtering will drop those events. Only the enabled session's events
	+	 * should arrive. We expect: SESSION_CREATE (from create) + 1 dump
	+	 * event for the enabled session = at least 1 from the dump.
	+	 */
	+	ec = atomic_load_acq_32(&callback_data->event_count);
	+	KTEST_LOG(ctx, "Received %u events (enabled+dump)", ec);
	+	KTEST_VERIFY(ec >= 1);
	+
	+	/* Unregister both sessions from the callback before destroying them. */
	+	dump_test_sessions[0] = NULL;
	+	dump_test_sessions[1] = NULL;
	+	eventlog_session_destroy(enabled_session);
	+	eventlog_session_destroy(disabled_session);
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&callback_data->lock);
	+	free(callback_data, M_EVENTLOG_TEST);
	+	eventlog_provider_destroy(provider);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Async dump_state contract: the callback does not run on the
	+ * subscribing thread, subscribe returns before the dump finishes,
	+ * drain_dumps() / destroy() are the sync points, and re-subscribing
	+ * does not re-fire the dump. Shared scratch for the tests below.
	+ */
	+/* Thread that ran async_dump_callback_record_thread */
	+static volatile struct thread *async_dump_thread;
	+/* Set to true once async_dump_callback_record_thread has run */
	+static volatile bool async_dump_observed;
	+/* Mutex and condvar on which async_dump_callback_block waits */
	+static struct mtx async_dump_mtx;
	+static struct cv async_dump_cv;
	+/* async_dump_callback_block keeps waiting while this is false */
	+static volatile bool async_dump_release;
	+/* Incremented once per invocation of either async dump callback */
	+static volatile uint32_t async_dump_runs;
	+
	+/*
	+ * Dump callback that notes which thread it was invoked on, flags that it
	+ * ran, and bumps the shared run counter. Both arguments are ignored.
	+ */
	+static void
	+async_dump_callback_record_thread(struct eventlog_provider *provider __unused,
	+    void *arg __unused)
	+{
	+	struct thread *self = curthread;
	+
	+	async_dump_thread = self;
	+	async_dump_observed = true;
	+	atomic_add_32(&async_dump_runs, 1);
	+}
	+
	+/*
	+ * Slow dump_callback: blocks until the test releases it via
	+ * async_dump_release. Used to put the dump task into a known
	+ * "in-flight" state so the test can race destroy / drain against it.
	+ *
	+ * async_dump_runs is bumped before waiting, so a test can poll it to
	+ * learn that the callback has been entered. The test must have
	+ * initialized async_dump_mtx and async_dump_cv before subscribing.
	+ */
	+static void
	+async_dump_callback_block(struct eventlog_provider *provider __unused,
	+ void *arg __unused)
	+{
	+ mtx_lock(&async_dump_mtx);
	+ atomic_add_32(&async_dump_runs, 1); /* announce entry before parking */
	+ while (!async_dump_release) /* re-check the flag after every wakeup */
	+ cv_wait(&async_dump_cv, &async_dump_mtx);
	+ mtx_unlock(&async_dump_mtx);
	+}
	+
	+/*
	+ * Verifies dump_callback runs on a thread different from the subscriber's
	+ * own thread (i.e. the framework taskqueue thread).
	+ *
	+ * The recording callback stores curthread; after drain_dumps() the test
	+ * compares that value with its own curthread and checks the callback
	+ * ran exactly once.
	+ */
	+KTEST_FUNC(dump_state_async_runs_off_caller_thread)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *cd;
	+
	+ KTEST_LOG(ctx, "Verifying dump_callback runs on a different thread");
	+
	+ /* Reset the shared scratch the recording callback writes to. */
	+ async_dump_thread = NULL;
	+ async_dump_observed = false;
	+ atomic_store_rel_32(&async_dump_runs, 0);
	+
	+ provider = test_create_provider("test_ds_async_thr",
	+ async_dump_callback_record_thread, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ session = eventlog_session_create(provider, 1, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
	+ mtx_init(&cd->lock, "test_ds_async_thr", NULL, MTX_DEF);
	+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+ cd);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_ds_async_thr", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+ /* Sync point: the scratch is only inspected after the dump is done. */
	+ eventlog_subscriber_drain_dumps(subscriber);
	+
	+ KTEST_VERIFY(async_dump_observed);
	+ KTEST_VERIFY(async_dump_thread != NULL);
	+ KTEST_VERIFY(async_dump_thread != curthread);
	+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&cd->lock);
	+ free(cd, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Verifies subscribe returns before a slow dump_callback finishes,
	+ * so providers can do expensive dump work without blocking the caller.
	+ *
	+ * The callback parks on async_dump_cv until this thread sets
	+ * async_dump_release, and that only happens after add_subscription()
	+ * has returned. A subscribe that ran the dump inline would therefore
	+ * never return.
	+ */
	+KTEST_FUNC(dump_state_async_subscribe_returns_before_dump)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *cd;
	+
	+ KTEST_LOG(ctx, "Verifying subscribe returns before dump completes");
	+
	+ /* Arm the blocking callback: counter cleared, gate closed. */
	+ atomic_store_rel_32(&async_dump_runs, 0);
	+ mtx_init(&async_dump_mtx, "async_dump_mtx", NULL, MTX_DEF);
	+ cv_init(&async_dump_cv, "async_dump_cv");
	+ async_dump_release = false;
	+
	+ provider = test_create_provider("test_ds_async_block",
	+ async_dump_callback_block, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ session = eventlog_session_create(provider, 1, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
	+ mtx_init(&cd->lock, "test_ds_async_block", NULL, MTX_DEF);
	+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+ cd);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ /*
	+ * Subscribe enqueues a dump that will block in the callback. The
	+ * call must return promptly even though the dump is parked --
	+ * that's the whole point of the rework.
	+ */
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_ds_async_block", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+ /* Release the dump so it can complete and decrement dump_pending. */
	+ mtx_lock(&async_dump_mtx);
	+ async_dump_release = true;
	+ cv_broadcast(&async_dump_cv);
	+ mtx_unlock(&async_dump_mtx);
	+
	+ eventlog_subscriber_drain_dumps(subscriber);
	+
	+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&cd->lock);
	+ free(cd, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+ /* Torn down last, after the drain and the subscriber destroy above. */
	+ cv_destroy(&async_dump_cv);
	+ mtx_destroy(&async_dump_mtx);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Verifies eventlog_subscriber_destroy() implicitly drains pending
	+ * dump tasks rather than freeing memory out from under them. We
	+ * subscribe with a callback that blocks, kick off destroy in a
	+ * separate thread, unblock the dump from the test thread, and
	+ * confirm destroy waited for it.
	+ *
	+ * The struct below is the hand-off between the test and that thread.
	+ */
	+
	+struct destroy_drain_thread_arg {
	+ struct eventlog_subscriber *subscriber; /* subscriber the thread destroys */
	+ volatile bool started; /* set just before destroy() is called */
	+ volatile bool returned; /* set once destroy() has returned */
	+};
	+
	+static void
	+destroy_drain_thread(void *arg)
	+{ /* kthread body: destroy the subscriber, flagging progress around it */
	+ struct destroy_drain_thread_arg *a = arg;
	+
	+ a->started = true; /* tell the test we are about to call destroy() */
	+ eventlog_subscriber_destroy(a->subscriber);
	+ a->returned = true; /* destroy() came back; the test polls this */
	+ kthread_exit();
	+}
	+
	+KTEST_FUNC(dump_state_destroy_waits_for_dump)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *cd;
	+ struct destroy_drain_thread_arg arg;
	+ struct thread *td;
	+ int i;
	+
	+ KTEST_LOG(ctx, "Verifying destroy() drains in-flight dumps");
	+
	+ /* Arm the blocking callback: counter cleared, gate closed. */
	+ atomic_store_rel_32(&async_dump_runs, 0);
	+ mtx_init(&async_dump_mtx, "async_dump_mtx", NULL, MTX_DEF);
	+ cv_init(&async_dump_cv, "async_dump_cv");
	+ async_dump_release = false;
	+
	+ provider = test_create_provider("test_ds_destroy_drain",
	+ async_dump_callback_block, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ session = eventlog_session_create(provider, 1, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
	+ mtx_init(&cd->lock, "test_ds_destroy_drain", NULL, MTX_DEF);
	+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+ cd);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_ds_destroy_drain", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+ /* Wait for the dump task to actually start running (and block). */
	+ for (i = 0; i < 1000; i++) {
	+ if (atomic_load_acq_32(&async_dump_runs) == 1)
	+ break;
	+ pause("ds_run", 1);
	+ }
	+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
	+
	+ /*
	+ * Spawn a thread that calls destroy(). It must NOT return until
	+ * we release the dump callback below. We give it 100ms to prove
	+ * it's stuck waiting on dump_pending, then release the callback
	+ * and wait for destroy() to complete.
	+ *
	+ * NOTE(review): arg lives on this stack frame; an early return
	+ * from a failed check below would leave the thread pointing at
	+ * it -- confirm this is acceptable for a test.
	+ */
	+ memset(&arg, 0, sizeof(arg));
	+ arg.subscriber = subscriber;
	+ KTEST_EQUAL(kthread_add(destroy_drain_thread, &arg, NULL, &td, 0, 0,
	+ "evl_ds_destroy_drain"), 0);
	+
	+ /* Wait for the destroy thread to start. */
	+ for (i = 0; i < 1000; i++) {
	+ if (arg.started)
	+ break;
	+ pause("ds_strt", 1);
	+ }
	+ KTEST_VERIFY(arg.started);
	+
	+ /*
	+ * Confirm destroy() is parked on dump_pending. The dump callback
	+ * is still blocked on the cv, so a destroy() that had already
	+ * returned would have torn the subscriber down underneath an
	+ * in-flight dump. arg.returned still being false after a
	+ * generous wait is the signal that destroy() is waiting.
	+ */
	+ pause("ds_park", hz / 10);
	+ KTEST_VERIFY(!arg.returned);
	+
	+ /* Release the dump and wait for destroy() to come back. */
	+ mtx_lock(&async_dump_mtx);
	+ async_dump_release = true;
	+ cv_broadcast(&async_dump_cv);
	+ mtx_unlock(&async_dump_mtx);
	+
	+ for (i = 0; i < 1000; i++) {
	+ if (arg.returned)
	+ break;
	+ pause("ds_done", 1);
	+ }
	+ KTEST_VERIFY(arg.returned);
	+
	+ /* The subscriber was already destroyed by the helper thread. */
	+ eventlog_session_destroy(session);
	+ mtx_destroy(&cd->lock);
	+ free(cd, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+ cv_destroy(&async_dump_cv);
	+ mtx_destroy(&async_dump_mtx);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Verifies that re-subscribing an already-subscribed (provider, level,
	+ * keywords) does not re-fire the dump_callback. The replay is a
	+ * one-shot per first-time subscribe; the subscriber already has the
	+ * state from the original subscribe.
	+ *
	+ * The run counter is checked after a drain following each subscribe
	+ * and must read 1 both times.
	+ */
	+KTEST_FUNC(dump_state_resubscribe_no_refire)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ struct eventlog_subscriber *subscriber;
	+ struct test_callback_data *cd;
	+
	+ KTEST_LOG(ctx, "Verifying re-subscribe does not re-fire dump");
	+
	+ /* Reset the shared scratch the recording callback writes to. */
	+ async_dump_thread = NULL;
	+ async_dump_observed = false;
	+ atomic_store_rel_32(&async_dump_runs, 0);
	+
	+ provider = test_create_provider("test_ds_resub",
	+ async_dump_callback_record_thread, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ session = eventlog_session_create(provider, 1, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ cd = malloc(sizeof(*cd), M_EVENTLOG_TEST, M_WAITOK | M_ZERO);
	+ mtx_init(&cd->lock, "test_ds_resub", NULL, MTX_DEF);
	+ subscriber = eventlog_subscriber_create_callback(test_event_callback,
	+ cd);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ /* First subscribe: the dump fires once. */
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_ds_resub", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+ eventlog_subscriber_drain_dumps(subscriber);
	+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
	+
	+ /* Re-subscribe with different level/keywords -- update in place. */
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(subscriber,
	+ "test_ds_resub", EVENTLOG_LEVEL_INFO, 0xF0F0F0F0,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+ eventlog_subscriber_drain_dumps(subscriber);
	+ /* Still 1: the second subscribe must not have queued another dump. */
	+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 1);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(subscriber);
	+ mtx_destroy(&cd->lock);
	+ free(cd, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Verifies the framework emits an EVENTLOG_DUMP_COMPLETE_ID event to
	+ * the requesting subscriber once the dump_callback returns. The
	+ * callback intentionally emits no events, so DUMP_COMPLETE is the
	+ * only thing the subscriber should see -- we check both event_count
	+ * and last_event_id to pin that down.
	+ *
	+ * Subscribers that did not request EVENTLOG_KEYWORD_SESSION must
	+ * NOT receive DUMP_COMPLETE; we verify this with a second subscriber
	+ * that subscribes with a non-session keyword mask.
	+ */
	+KTEST_FUNC(dump_state_emits_dump_complete)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_session *session;
	+ /* Both subscribers and their callback data are heap objects. */
	+ struct eventlog_subscriber *with_session, *without_session;
	+ struct test_callback_data *cd_with, *cd_without;
	+
	+ KTEST_LOG(ctx, "Verifying DUMP_COMPLETE emission and keyword filter");
	+
	+ /* Reset the shared scratch the recording callback writes to. */
	+ async_dump_thread = NULL;
	+ async_dump_observed = false;
	+ atomic_store_rel_32(&async_dump_runs, 0);
	+
	+ provider = test_create_provider("test_ds_complete",
	+ async_dump_callback_record_thread, NULL);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ session = eventlog_session_create(provider, 1, true, NULL, 0);
	+ KTEST_NEQUAL(session, NULL);
	+
	+ cd_with = malloc(sizeof(*cd_with), M_EVENTLOG_TEST,
	+ M_WAITOK | M_ZERO);
	+ mtx_init(&cd_with->lock, "test_ds_complete_w", NULL, MTX_DEF);
	+ with_session = eventlog_subscriber_create_callback(
	+ test_event_callback, cd_with);
	+ KTEST_NEQUAL(with_session, NULL);
	+
	+ cd_without = malloc(sizeof(*cd_without), M_EVENTLOG_TEST,
	+ M_WAITOK | M_ZERO);
	+ mtx_init(&cd_without->lock, "test_ds_complete_wo", NULL, MTX_DEF);
	+ without_session = eventlog_subscriber_create_callback(
	+ test_event_callback, cd_without);
	+ KTEST_NEQUAL(without_session, NULL);
	+
	+ /* with_session: full mask -- includes EVENTLOG_KEYWORD_SESSION. */
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(with_session,
	+ "test_ds_complete", EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+ /* without_session: SESSION bit (0x80000000) explicitly cleared. */
	+ KTEST_EQUAL(eventlog_subscriber_add_subscription(without_session,
	+ "test_ds_complete", EVENTLOG_LEVEL_VERBOSE, 0x7FFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE), 0);
	+
	+ /* Sync point for both dumps before any counter is inspected. */
	+ eventlog_subscriber_drain_dumps(with_session);
	+ eventlog_subscriber_drain_dumps(without_session);
	+
	+ /* dump_callback ran once for each subscriber */
	+ KTEST_EQUAL(atomic_load_acq_32(&async_dump_runs), 2);
	+
	+ /*
	+ * with_session should have received exactly one event --
	+ * the synthetic DUMP_COMPLETE. without_session should have
	+ * received nothing (SESSION keyword stripped).
	+ */
	+ KTEST_EQUAL(atomic_load_acq_32(&cd_with->event_count), 1);
	+ KTEST_EQUAL(atomic_load_acq_32(&cd_with->last_event_id),
	+ EVENTLOG_DUMP_COMPLETE_ID);
	+ KTEST_EQUAL(atomic_load_acq_32(&cd_without->event_count), 0);
	+
	+ eventlog_session_destroy(session);
	+ eventlog_subscriber_destroy(with_session);
	+ eventlog_subscriber_destroy(without_session);
	+ mtx_destroy(&cd_with->lock);
	+ free(cd_with, M_EVENTLOG_TEST);
	+ mtx_destroy(&cd_without->lock);
	+ free(cd_without, M_EVENTLOG_TEST);
	+ eventlog_provider_destroy(provider);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Multi-provider callback data: tracks events per provider_id to verify that
	+ * events from multiple same-named providers are all delivered.
	+ *
	+ * seen_provider_ids[i] and seen_provider_id_counts[i] form one table
	+ * entry; only the first num_distinct_providers entries are valid.
	+ */
	+struct multi_provider_callback_data {
	+ volatile uint32_t event_count; /* every event delivered */
	+ volatile uint16_t seen_provider_ids[8]; /* distinct provider_id values */
	+ volatile uint32_t seen_provider_id_counts[8]; /* events per table entry */
	+ volatile int num_distinct_providers; /* valid entries in the table */
	+};
	+
	+static void
	+multi_provider_callback(const struct eventlog_event_header *hdr,
	+ const char *provider_name __unused, uint8_t provider_name_len __unused,
	+ uint64_t session_id __unused,
	+ const struct iovec *iov __unused, int iovcnt __unused,
	+ size_t payload_size __unused, void *callback_arg)
	+{
	+ struct multi_provider_callback_data *stats = callback_arg;
	+ int known, slot;
	+
	+ /* Every delivery is counted, whichever provider it came from. */
	+ atomic_add_int(&stats->event_count, 1);
	+
	+ /* An already-tracked provider_id only bumps its own counter. */
	+ known = atomic_load_acq_int(&stats->num_distinct_providers);
	+ slot = 0;
	+ while (slot < known) {
	+ if (stats->seen_provider_ids[slot] == hdr->provider_id) {
	+ atomic_add_int(&stats->seen_provider_id_counts[slot], 1);
	+ return;
	+ }
	+ slot++;
	+ }
	+
	+ /* Table full: the event stays counted in event_count only. */
	+ if (known >= 8)
	+ return;
	+
	+ /* New provider_id - add it (racy but fine for small test counts) */
	+ stats->seen_provider_ids[known] = hdr->provider_id;
	+ stats->seen_provider_id_counts[known] = 1;
	+ atomic_add_rel_int(&stats->num_distinct_providers, 1);
	+}
	+
	+/*
	+ * Subscribing by name enables ALL providers with that name.
	+ *
	+ * Two providers are created under one name; a single subscription must
	+ * set the same level/keywords on both, and destroying the subscriber
	+ * must return both to LEVEL_NONE / keywords 0.
	+ */
	+KTEST_FUNC(multi_provider_subscribe_enables_all)
	+{
	+ struct eventlog_provider *p1, *p2;
	+ struct eventlog_subscriber *subscriber;
	+ struct multi_provider_callback_data cb_data;
	+ int error;
	+
	+ KTEST_LOG(ctx,
	+ "Testing subscribe-by-name enables all matching providers");
	+
	+ p1 = test_create_provider("test_mp_en", NULL, NULL);
	+ KTEST_NEQUAL(p1, NULL);
	+ p2 = test_create_provider("test_mp_en", NULL, NULL);
	+ KTEST_NEQUAL(p2, NULL);
	+
	+ /* Both providers should start disabled */
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_NONE);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
	+
	+ memset(&cb_data, 0, sizeof(cb_data));
	+ subscriber = eventlog_subscriber_create_callback(
	+ multi_provider_callback, &cb_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_en",
	+ EVENTLOG_LEVEL_INFO, 0x7, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ /* Both providers should now be enabled with the same level/keywords */
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_INFO);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_INFO);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0x7);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0x7);
	+
	+ /* Destroying subscriber should disable both */
	+ eventlog_subscriber_destroy(subscriber);
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_NONE);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0);
	+
	+ eventlog_provider_destroy(p1);
	+ eventlog_provider_destroy(p2);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Events from both same-named providers reach a single subscriber,
	+ * and they carry distinct provider_ids.
	+ *
	+ * The subscription is added before the sessions are created, so the
	+ * expected totals include one SESSION_CREATE per session.
	+ */
	+KTEST_FUNC(multi_provider_events_from_both)
	+{
	+ struct eventlog_provider *p1, *p2;
	+ struct eventlog_session *s1, *s2;
	+ struct eventlog_subscriber *subscriber;
	+ struct multi_provider_callback_data cb_data;
	+ uint32_t payload = 0xCAFE;
	+ int error;
	+
	+ KTEST_LOG(ctx, "Testing events from both same-named providers");
	+
	+ p1 = test_create_provider("test_mp_ev", NULL, NULL);
	+ KTEST_NEQUAL(p1, NULL);
	+ p2 = test_create_provider("test_mp_ev", NULL, NULL);
	+ KTEST_NEQUAL(p2, NULL);
	+
	+ memset(&cb_data, 0, sizeof(cb_data));
	+ subscriber = eventlog_subscriber_create_callback(
	+ multi_provider_callback, &cb_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_ev",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ s1 = eventlog_session_create(p1, 100, true, NULL, 0);
	+ KTEST_NEQUAL(s1, NULL);
	+ s2 = eventlog_session_create(p2, 200, true, NULL, 0);
	+ KTEST_NEQUAL(s2, NULL);
	+
	+ /* Write events from each provider */
	+ eventlog_event_write(s1, 0x1001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+ eventlog_event_write(s2, 0x2001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+ eventlog_event_write(s1, 0x1002, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+ eventlog_event_write(s2, 0x2002, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+
	+ /* 2 SESSION_CREATEs + 4 user events = 6 total */
	+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.event_count), 6);
	+
	+ /* Events should have come from 2 distinct provider_ids */
	+ KTEST_EQUAL(atomic_load_acq_int(&cb_data.num_distinct_providers), 2);
	+ /* Each provider sent 3 events (1 SESSION_CREATE + 2 user) */
	+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.seen_provider_id_counts[0]), 3);
	+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.seen_provider_id_counts[1]), 3);
	+
	+ eventlog_session_destroy(s1);
	+ eventlog_session_destroy(s2);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(p1);
	+ eventlog_provider_destroy(p2);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Destroying one same-named provider doesn't affect the other.
	+ * Subscription and event delivery continue for the surviving provider.
	+ *
	+ * After p1 and the first subscriber are gone, a fresh subscriber is
	+ * attached by name and one more event from p2's session must arrive.
	+ */
	+KTEST_FUNC(multi_provider_destroy_one)
	+{
	+ struct eventlog_provider *p1, *p2;
	+ struct eventlog_session *s1, *s2;
	+ struct eventlog_subscriber *subscriber;
	+ struct multi_provider_callback_data cb_data;
	+ uint32_t payload = 0xBEEF;
	+ uint32_t count_before;
	+ int error;
	+
	+ KTEST_LOG(ctx, "Testing destroy one of two same-named providers");
	+
	+ p1 = test_create_provider("test_mp_d1", NULL, NULL);
	+ KTEST_NEQUAL(p1, NULL);
	+ p2 = test_create_provider("test_mp_d1", NULL, NULL);
	+ KTEST_NEQUAL(p2, NULL);
	+
	+ memset(&cb_data, 0, sizeof(cb_data));
	+ subscriber = eventlog_subscriber_create_callback(
	+ multi_provider_callback, &cb_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_d1",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ s1 = eventlog_session_create(p1, 1, true, NULL, 0);
	+ KTEST_NEQUAL(s1, NULL);
	+ s2 = eventlog_session_create(p2, 2, true, NULL, 0);
	+ KTEST_NEQUAL(s2, NULL);
	+
	+ /* Write an event from each */
	+ eventlog_event_write(s1, 0x1001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+ eventlog_event_write(s2, 0x2001, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+
	+ /*
	+ * Destroy s1 and subscriber, then p1.
	+ * Subscriber must be destroyed before its providers so that
	+ * subscription pointers are cleaned up first.
	+ */
	+ eventlog_session_destroy(s1);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(p1);
	+
	+ /* p2 should now be disabled (no subscribers left) */
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0);
	+
	+ /* Re-subscribe to verify p2 still works after p1 is gone */
	+ memset(&cb_data, 0, sizeof(cb_data));
	+ subscriber = eventlog_subscriber_create_callback(
	+ multi_provider_callback, &cb_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_d1",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ /* p2 should be enabled again */
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_VERBOSE);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0xFFFFFFFF);
	+
	+ /* Events from p2 should arrive */
	+ count_before = atomic_load_acq_32(&cb_data.event_count);
	+ eventlog_event_write(s2, 0x2002, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &payload, sizeof(payload));
	+ KTEST_EQUAL(atomic_load_acq_32(&cb_data.event_count), count_before + 1);
	+
	+ eventlog_session_destroy(s2);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(p2);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Dump state callback is invoked for each matching provider when subscribing
	+ * by name with DUMP_STATE.
	+ */
	+/* Number of times mp_test_dump_callback has run. */
	+static volatile uint32_t mp_dump_invocations;
	+/* Sessions the callback may write to; NULL entries are skipped. */
	+static struct eventlog_session *mp_dump_sessions[4];
	+/* How many leading entries of mp_dump_sessions the callback scans. */
	+static int mp_dump_session_count;
	+
	+/*
	+ * Dump callback shared by the same-named test providers. Counts the
	+ * invocation, then writes one INFO event (id 0x200 + index, payload
	+ * 0xdead0000 | index) to every registered session whose
	+ * effective_level is at least INFO. The provider argument is ignored,
	+ * so each invocation walks all registered sessions.
	+ */
	+static void
	+mp_test_dump_callback(struct eventlog_provider *provider __unused,
	+ void *arg __unused)
	+{
	+ int i;
	+
	+ atomic_add_int(&mp_dump_invocations, 1);
	+ for (i = 0; i < mp_dump_session_count; i++) {
	+ if (mp_dump_sessions[i] != NULL &&
	+ mp_dump_sessions[i]->effective_level >=
	+ EVENTLOG_LEVEL_INFO) {
	+ uint32_t data = 0xdead0000 | i;
	+ eventlog_event_write(mp_dump_sessions[i], 0x200 + i,
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+ &data, sizeof(data));
	+ }
	+ }
	+}
	+
	+/*
	+ * Subscribes by name with DUMP_STATE to two same-named providers and
	+ * checks, after draining, that the dump callback ran at least twice,
	+ * that at least two events reached the subscriber, and that they came
	+ * from two distinct provider_ids. Returns 0 on success, EINVAL when
	+ * one of the explicit checks fails.
	+ */
	+KTEST_FUNC(multi_provider_dump_state)
	+{
	+ /* All handles start NULL so the cleanup path can test them. */
	+ struct eventlog_provider *p1 = NULL, *p2 = NULL;
	+ struct eventlog_session *s1 = NULL, *s2 = NULL;
	+ struct eventlog_subscriber *subscriber = NULL;
	+ struct multi_provider_callback_data cb_data;
	+ uint32_t invocations, ec;
	+ int ret = 0;
	+ int error;
	+
	+ KTEST_LOG(ctx, "Testing dump state invoked for each matching provider");
	+
	+ mp_dump_invocations = 0;
	+ mp_dump_session_count = 2;
	+
	+ /*
	+ * NOTE(review): the KTEST_NEQUAL checks below presumably return
	+ * directly on failure and so bypass the cleanup label -- confirm.
	+ */
	+ p1 = test_create_provider("test_mp_ds", mp_test_dump_callback, NULL);
	+ KTEST_NEQUAL(p1, NULL);
	+ p2 = test_create_provider("test_mp_ds", mp_test_dump_callback, NULL);
	+ KTEST_NEQUAL(p2, NULL);
	+
	+ s1 = eventlog_session_create(p1, 1, true, NULL, 0);
	+ KTEST_NEQUAL(s1, NULL);
	+ mp_dump_sessions[0] = s1;
	+
	+ s2 = eventlog_session_create(p2, 2, true, NULL, 0);
	+ KTEST_NEQUAL(s2, NULL);
	+ mp_dump_sessions[1] = s2;
	+
	+ memset(&cb_data, 0, sizeof(cb_data));
	+ subscriber = eventlog_subscriber_create_callback(
	+ multi_provider_callback, &cb_data);
	+ KTEST_NEQUAL(subscriber, NULL);
	+
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_ds",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF,
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE);
	+ if (error != 0) {
	+ KTEST_ERR(ctx, "FAIL: add_subscription returned %d", error);
	+ ret = EINVAL;
	+ goto cleanup;
	+ }
	+ KTEST_LOG(ctx, "PASS: error == 0");
	+
	+ /*
	+ * Two providers share the name, so two dump tasks were enqueued.
	+ * Drain so the invocation/event-count assertions below see the
	+ * post-dump steady state.
	+ */
	+ eventlog_subscriber_drain_dumps(subscriber);
	+
	+ invocations = atomic_load_acq_32(&mp_dump_invocations);
	+ KTEST_LOG(ctx, "Dump callback invoked %u times", invocations);
	+ if (invocations < 2) {
	+ KTEST_ERR(ctx, "FAIL: dump invocations %u < 2", invocations);
	+ ret = EINVAL;
	+ goto cleanup;
	+ }
	+ KTEST_LOG(ctx, "PASS: invocations >= 2");
	+
	+ ec = atomic_load_acq_32(&cb_data.event_count);
	+ KTEST_LOG(ctx, "Subscriber received %u events", ec);
	+ if (ec < 2) {
	+ KTEST_ERR(ctx, "FAIL: event_count %u < 2", ec);
	+ ret = EINVAL;
	+ goto cleanup;
	+ }
	+ KTEST_LOG(ctx, "PASS: event_count >= 2");
	+
	+ if (atomic_load_acq_int(&cb_data.num_distinct_providers) != 2) {
	+ KTEST_ERR(ctx, "FAIL: num_distinct_providers %d != 2",
	+ atomic_load_acq_int(&cb_data.num_distinct_providers));
	+ ret = EINVAL;
	+ goto cleanup;
	+ }
	+ KTEST_LOG(ctx, "PASS: num_distinct_providers == 2");
	+
	+cleanup:
	+ /* Unpublish the sessions before destroying them. */
	+ mp_dump_sessions[0] = NULL;
	+ mp_dump_sessions[1] = NULL;
	+ if (s1 != NULL)
	+ eventlog_session_destroy(s1);
	+ if (s2 != NULL)
	+ eventlog_session_destroy(s2);
	+ if (subscriber != NULL)
	+ eventlog_subscriber_destroy(subscriber);
	+ if (p1 != NULL)
	+ eventlog_provider_destroy(p1);
	+ if (p2 != NULL)
	+ eventlog_provider_destroy(p2);
	+
	+ return (ret);
	+}
	+
	+/*
	+ * Two subscribers with different filters targeting same-named providers.
	+ * Each provider instance gets its enablement from the union of all subscribers.
	+ *
	+ * Level and keywords are checked on both providers after every
	+ * subscriber is added or removed.
	+ */
	+KTEST_FUNC(multi_provider_independent_enablement)
	+{
	+ struct eventlog_provider *p1, *p2;
	+ struct eventlog_subscriber *sub_name, *sub_other;
	+ struct multi_provider_callback_data cb1, cb2;
	+ int error;
	+
	+ KTEST_LOG(ctx, "Testing per-provider enablement with multi-provider");
	+
	+ p1 = test_create_provider("test_mp_ie", NULL, NULL);
	+ KTEST_NEQUAL(p1, NULL);
	+ p2 = test_create_provider("test_mp_ie", NULL, NULL);
	+ KTEST_NEQUAL(p2, NULL);
	+
	+ /* Subscribe to "test_mp_ie" at INFO/0x3 - enables both providers */
	+ memset(&cb1, 0, sizeof(cb1));
	+ sub_name = eventlog_subscriber_create_callback(multi_provider_callback,
	+ &cb1);
	+ KTEST_NEQUAL(sub_name, NULL);
	+ error = eventlog_subscriber_add_subscription(sub_name, "test_mp_ie",
	+ EVENTLOG_LEVEL_INFO, 0x3, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_INFO);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_INFO);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0x3);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0x3);
	+
	+ /* Add second subscriber at VERBOSE/0xC - both get union */
	+ memset(&cb2, 0, sizeof(cb2));
	+ sub_other = eventlog_subscriber_create_callback(multi_provider_callback,
	+ &cb2);
	+ KTEST_NEQUAL(sub_other, NULL);
	+ error = eventlog_subscriber_add_subscription(sub_other, "test_mp_ie",
	+ EVENTLOG_LEVEL_VERBOSE, 0xC, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_VERBOSE);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_VERBOSE);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0xF); /* 0x3 | 0xC */
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0xF);
	+
	+ /* Remove first subscriber - enablement drops to VERBOSE/0xC */
	+ eventlog_subscriber_destroy(sub_name);
	+
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_VERBOSE);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_VERBOSE);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p1), 0xC);
	+ KTEST_EQUAL(eventlog_provider_get_keywords(p2), 0xC);
	+
	+ /* Remove second subscriber - both disabled */
	+ eventlog_subscriber_destroy(sub_other);
	+
	+ KTEST_EQUAL(eventlog_provider_get_level(p1), EVENTLOG_LEVEL_NONE);
	+ KTEST_EQUAL(eventlog_provider_get_level(p2), EVENTLOG_LEVEL_NONE);
	+
	+ eventlog_provider_destroy(p1);
	+ eventlog_provider_destroy(p2);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Device subscriber receives events from all same-named providers.
	+ * Verifies multi-provider support works with device (buffered) subscribers,
	+ * not just callback subscribers.
	+ *
	+ * The buffered records are read back in one call and walked by
	+ * event_length, counting records and distinct provider_ids.
	+ */
	+KTEST_FUNC(multi_provider_device_subscriber)
	+{
	+ struct eventlog_provider *p1, *p2;
	+ struct eventlog_session *s1, *s2;
	+ struct eventlog_subscriber *subscriber;
	+ uint32_t payload = 0xFACE;
	+ /* NOTE(review): 8 KiB on the kernel stack is large -- confirm it fits. */
	+ char read_buf[8 * 1024];
	+ size_t total_read;
	+ struct eventlog_event_header *hdr;
	+ uint16_t seen_ids[2] = {0, 0};
	+ int num_distinct = 0;
	+ int total_events = 0;
	+ size_t offset;
	+ int error, i;
	+
	+ KTEST_LOG(ctx,
	+ "Testing device subscriber with multiple same-named providers");
	+
	+ p1 = test_create_provider("test_mp_dev", NULL, NULL);
	+ KTEST_NEQUAL(p1, NULL);
	+ p2 = test_create_provider("test_mp_dev", NULL, NULL);
	+ KTEST_NEQUAL(p2, NULL);
	+
	+ subscriber = eventlog_subscriber_create_device(
	+ EVENTLOG_SUBSCRIBER_BUFFER_SIZE_DEFAULT);
	+ KTEST_NEQUAL(subscriber, NULL);
	+ error = eventlog_subscriber_add_subscription(subscriber, "test_mp_dev",
	+ EVENTLOG_LEVEL_VERBOSE, 0xFFFFFFFF, 0);
	+ KTEST_EQUAL(error, 0);
	+
	+ s1 = eventlog_session_create(p1, 1, true, NULL, 0);
	+ KTEST_NEQUAL(s1, NULL);
	+ s2 = eventlog_session_create(p2, 2, true, NULL, 0);
	+ KTEST_NEQUAL(s2, NULL);
	+
	+ /* Write events from each provider */
	+ for (i = 0; i < 5; i++) {
	+ eventlog_event_write(s1, 0x1000 + i, EVENTLOG_LEVEL_INFO,
	+ 0xFFFFFFFF, &payload, sizeof(payload));
	+ eventlog_event_write(s2, 0x2000 + i, EVENTLOG_LEVEL_INFO,
	+ 0xFFFFFFFF, &payload, sizeof(payload));
	+ }
	+
	+ /* Read all available events */
	+ total_read = eventlog_read_into_buf(subscriber, read_buf,
	+ sizeof(read_buf), FNONBLOCK);
	+ KTEST_VERIFY(total_read > 0);
	+
	+ /* Parse events and count distinct provider_ids */
	+ offset = 0;
	+ while (offset + sizeof(struct eventlog_event_header) <= total_read) {
	+ bool found;
	+
	+ hdr = (struct eventlog_event_header *)(read_buf + offset);
	+ /* Stop on a record that is malformed or runs past the data read. */
	+ if (hdr->event_length < sizeof(struct eventlog_event_header) ||
	+ offset + hdr->event_length > total_read)
	+ break;
	+
	+ total_events++;
	+ found = false;
	+ for (i = 0; i < num_distinct; i++) {
	+ if (seen_ids[i] == hdr->provider_id) {
	+ found = true;
	+ break;
	+ }
	+ }
	+ if (!found && num_distinct < 2) {
	+ seen_ids[num_distinct++] = hdr->provider_id;
	+ }
	+ offset += hdr->event_length;
	+ }
	+
	+ KTEST_LOG(ctx, "Read %d events from %d distinct provider_ids",
	+ total_events, num_distinct);
	+
	+ /* 2 SESSION_CREATEs + 10 user events = 12 total */
	+ KTEST_EQUAL(total_events, 12);
	+ KTEST_EQUAL(num_distinct, 2);
	+
	+ eventlog_session_destroy(s1);
	+ eventlog_session_destroy(s2);
	+ eventlog_subscriber_destroy(subscriber);
	+ eventlog_provider_destroy(p1);
	+ eventlog_provider_destroy(p2);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Helpers + tests for the subscribers_changed provider callback and
	+ * eventlog_provider_config (NULL config, default_enabled, etc).
	+ *
	+ * Contract for subscribers_changed: fires exactly once per real
	+ * 0<->N transition, runs without sessions_lock so the callback may
	+ * sleep, NULL is a safe "no callback" value.
	+ *
	+ * struct subch_count is the tally a counting callback fills in; the
	+ * tests pass it as the callback argument.
	+ */
	+
	+struct subch_count {
	+ volatile int n_true; /* callbacks with has_subscribers=true */
	+ volatile int n_false; /* ...with has_subscribers=false */
	+ volatile int last_state; /* most recent has_subscribers, as 1 or 0 */
	+};
	+
	+static void
	+test_subch_count_cb(struct eventlog_provider *provider __unused,
	+ bool has_subscribers, void *arg)
	+{
	+ struct subch_count *tally = arg;
	+
	+ /* Bump the counter that matches the reported direction. */
	+ if (!has_subscribers)
	+ atomic_add_int(&tally->n_false, 1);
	+ else
	+ atomic_add_int(&tally->n_true, 1);
	+
	+ /* Publish the latest state as 1/0 for the tests to read back. */
	+ atomic_store_rel_32((volatile uint32_t *)&tally->last_state,
	+ has_subscribers ? 1 : 0);
	+}
	+
	+/*
	+ * Walks a provider through 0 -> 1 -> 2 -> 1 -> 0 subscribers and checks
	+ * that the counting callback fired only on the 0 -> 1 and 1 -> 0 edges.
	+ */
	+KTEST_FUNC(subscribers_changed_basic)
	+{
	+ struct eventlog_provider *provider;
	+ struct eventlog_subscriber *sub1, *sub2;
	+ /*
	+ * Filled in by test_enable_provider_callback(); presumably
	+ * allocated there, since this test destroys the lock and frees
	+ * them -- confirm against the helper.
	+ */
	+ struct test_callback_data *cb1, *cb2;
	+ struct subch_count c = { 0, 0, 0 };
	+ struct eventlog_provider_config cfg = {
	+ .subscribers_changed = test_subch_count_cb,
	+ .subscribers_changed_arg = &c,
	+ };
	+
	+ KTEST_LOG(ctx, "subscribers_changed fires exactly once per 0<->N edge");
	+
	+ provider = eventlog_provider_create("test_subch_basic", &cfg);
	+ KTEST_NEQUAL(provider, NULL);
	+
	+ /* No subscriber yet -> no callback ever fired. */
	+ KTEST_EQUAL(c.n_true, 0);
	+ KTEST_EQUAL(c.n_false, 0);
	+
	+ /* First subscriber: 0->1 transition, expect one (true). */
	+ sub1 = test_enable_provider_callback("test_subch_basic",
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb1);
	+ KTEST_NEQUAL(sub1, NULL);
	+ KTEST_EQUAL(c.n_true, 1);
	+ KTEST_EQUAL(c.n_false, 0);
	+ KTEST_EQUAL(c.last_state, 1);
	+
	+ /* Second subscriber: 1->2, no transition, no callback. */
	+ sub2 = test_enable_provider_callback("test_subch_basic",
	+ EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb2);
	+ KTEST_NEQUAL(sub2, NULL);
	+ KTEST_EQUAL(c.n_true, 1);
	+ KTEST_EQUAL(c.n_false, 0);
	+
	+ /* Drop one subscriber: 2->1, no transition, no callback. */
	+ eventlog_subscriber_destroy(sub2);
	+ mtx_destroy(&cb2->lock);
	+ free(cb2, M_EVENTLOG_TEST);
	+ KTEST_EQUAL(c.n_true, 1);
	+ KTEST_EQUAL(c.n_false, 0);
	+
	+ /* Drop the last subscriber: 1->0, expect one (false). */
	+ eventlog_subscriber_destroy(sub1);
	+ mtx_destroy(&cb1->lock);
	+ free(cb1, M_EVENTLOG_TEST);
	+ KTEST_EQUAL(c.n_true, 1);
	+ KTEST_EQUAL(c.n_false, 1);
	+ KTEST_EQUAL(c.last_state, 0);
	+
	+ eventlog_provider_destroy(provider);
	+ return (0);
	+}
	+
	+/*
	+ * A provider created without a subscribers_changed callback must
	+ * survive a full subscribe / unsubscribe cycle.
	+ */
	+KTEST_FUNC(subscribers_changed_null_safe)
	+{
	+	struct eventlog_provider_config cfg = {
	+		/* subscribers_changed deliberately NULL */
	+	};
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *cb_data;
	+
	+	KTEST_LOG(ctx, "NULL subscribers_changed is a safe no-op");
	+
	+	provider = eventlog_provider_create("test_subch_null", &cfg);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	/* Subscribe: the framework must tolerate the missing callback. */
	+	subscriber = test_enable_provider_callback("test_subch_null",
	+	    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+
	+	/* Unsubscribe and release the callback bookkeeping. */
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&cb_data->lock);
	+	free(cb_data, M_EVENTLOG_TEST);
	+
	+	eventlog_provider_destroy(provider);
	+	return (0);
	+}
	+
	+/*
	+ * subscribers_changed_runs_unlocked: prove the callback is invoked in a
	+ * context where the caller is permitted to sleep / take its own
	+ * sleepable locks. If sessions_lock (or any non-sleepable lock) were
	+ * held when the callback fired, sx_xlock + pause_sbt would WITNESS- /
	+ * INVARIANTS-fail with "sleeping with mutex held".
	+ */
	+struct subch_unlocked_state {
	+ struct sx outer; /* sleepable lock taken inside the callback */
	+ int n; /* incremented once per callback invocation */
	+};
	+
	+/*
	+ * subscribers_changed callback that deliberately sleeps: it takes the
	+ * sx lock from the struct subch_unlocked_state passed as arg, pauses
	+ * for one tick, and then counts the invocation.
	+ */
	+static void
	+test_subch_unlocked_cb(struct eventlog_provider *provider __unused,
	+    bool has_subscribers __unused, void *arg)
	+{
	+	struct subch_unlocked_state *state = arg;
	+
	+	MPASS(THREAD_CAN_SLEEP());
	+
	+	sx_xlock(&state->outer);
	+	/*
	+	 * One-tick sleep. WITNESS / INVARIANTS will complain here if a
	+	 * non-sleepable lock (most importantly sessions_lock) is held.
	+	 */
	+	pause("subch", 1);
	+	sx_xunlock(&state->outer);
	+
	+	atomic_add_int(&state->n, 1);
	+}
	+
	+/*
	+ * Subscribe and unsubscribe once with a callback that sleeps, and
	+ * check that it ran exactly once for each transition.
	+ */
	+KTEST_FUNC(subscribers_changed_runs_unlocked)
	+{
	+	struct subch_unlocked_state state;
	+	struct eventlog_provider_config cfg = {
	+		.subscribers_changed = test_subch_unlocked_cb,
	+		.subscribers_changed_arg = &state,
	+	};
	+	struct eventlog_provider *provider;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *cb_data;
	+
	+	KTEST_LOG(ctx,
	+	    "callback runs outside sessions_lock (sleepable context)");
	+
	+	/* The state must be ready before the provider can call back. */
	+	memset(&state, 0, sizeof(state));
	+	sx_init(&state.outer, "test_subch_outer");
	+
	+	provider = eventlog_provider_create("test_subch_unlocked", &cfg);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	/* 0 -> 1 subscribers: first callback. */
	+	subscriber = test_enable_provider_callback("test_subch_unlocked",
	+	    EVENTLOG_LEVEL_INFO, 0xFFFFFFFF, &cb_data);
	+	KTEST_NEQUAL(subscriber, NULL);
	+	KTEST_EQUAL(state.n, 1);
	+
	+	/* 1 -> 0 subscribers: second callback. */
	+	eventlog_subscriber_destroy(subscriber);
	+	mtx_destroy(&cb_data->lock);
	+	free(cb_data, M_EVENTLOG_TEST);
	+	KTEST_EQUAL(state.n, 2);
	+
	+	eventlog_provider_destroy(provider);
	+	sx_destroy(&state.outer);
	+	return (0);
	+}
	+
	+/*
	+ * Concurrent subscribe / unsubscribe storm. Smoke test for
	+ * eventlog_update_provider_enablement under contention. Starts and
	+ * ends at the no-subscribers state, so n_true must equal n_false at
	+ * quiesce; INVARIANTS-only MPASS checks backstop subtler races.
	+ */
	+/* Per-thread arguments and results for test_subch_storm_thread. */
	+struct subch_storm_args {
	+ struct eventlog_provider *provider; /* set by the test; not read by the thread */
	+ const char *provider_name; /* provider the thread subscribes to */
	+ int *stop; /* shared flag; thread loops while *stop == 0 */
	+ int iterations_done; /* completed sub/unsub rounds, stored at exit */
	+ int exited; /* set to 1 at exit; also the wakeup channel */
	+};
	+
	+/*
	+ * Storm worker: subscribe to and unsubscribe from the named provider
	+ * in a tight loop until the shared stop flag is set or a subscribe
	+ * attempt fails, then publish the round count, flag the exit and wake
	+ * the waiter.
	+ */
	+static void
	+test_subch_storm_thread(void *arg)
	+{
	+	struct subch_storm_args *args = arg;
	+	struct eventlog_subscriber *subscriber;
	+	struct test_callback_data *cb_data;
	+	int rounds;
	+
	+	for (rounds = 0;
	+	    atomic_load_acq_32((volatile uint32_t *)args->stop) == 0;
	+	    rounds++) {
	+		subscriber = test_enable_provider_callback(
	+		    args->provider_name, EVENTLOG_LEVEL_INFO, 0xFFFFFFFF,
	+		    &cb_data);
	+		if (subscriber == NULL)
	+			break;
	+		eventlog_subscriber_destroy(subscriber);
	+		mtx_destroy(&cb_data->lock);
	+		free(cb_data, M_EVENTLOG_TEST);
	+		/* Give the other storm threads a chance to interleave. */
	+		kern_yield(PRI_UNCHANGED);
	+	}
	+
	+	atomic_store_rel_32((volatile uint32_t *)&args->iterations_done,
	+	    rounds);
	+	atomic_store_rel_32((volatile uint32_t *)&args->exited, 1);
	+	wakeup(&args->exited);
	+	kthread_exit();
	+}
	+
	+/*
	+ * Callout handler: raise the int flag passed as arg and wake anything
	+ * sleeping on its address.
	+ */
	+static void
	+test_subch_stop_callout(void *arg)
	+{
	+	int *stop_flag = arg;
	+
	+	atomic_store_rel_32((volatile uint32_t *)stop_flag, 1);
	+	wakeup(stop_flag);
	+}
	+
	+/*
	+ * Run SUBCH_NTHREADS storm threads against one provider for
	+ * SUBCH_RUNTIME_S seconds, wait for all of them to exit, then check
	+ * that the true/false callback counts balance.
	+ *
	+ * args[] and stop live on this function's stack and are used by the
	+ * worker threads, so the function must not return while any started
	+ * thread is still running. A kthread_add failure therefore stops and
	+ * joins the threads already started before the failure is reported.
	+ */
	+KTEST_FUNC(subscribers_changed_concurrent_subunsub)
	+{
	+#define SUBCH_NTHREADS 8
	+#define SUBCH_RUNTIME_S 1
	+	struct eventlog_provider *provider;
	+	struct subch_count c = { 0, 0, 0 };
	+	struct subch_storm_args args[SUBCH_NTHREADS];
	+	struct thread *threads[SUBCH_NTHREADS];
	+	struct callout stop_co;
	+	int stop = 0;
	+	int i, error, nstarted;
	+	int total_iterations;
	+	struct eventlog_provider_config cfg = {
	+		.subscribers_changed = test_subch_count_cb,
	+		.subscribers_changed_arg = &c,
	+	};
	+
	+	KTEST_LOG(ctx, "concurrent sub/unsub: %d threads x %d s, no phantom "
	+	    "transitions, n_true == n_false at quiesce",
	+	    SUBCH_NTHREADS, SUBCH_RUNTIME_S);
	+
	+	provider = eventlog_provider_create("test_subch_storm", &cfg);
	+	KTEST_NEQUAL(provider, NULL);
	+
	+	/* Initialise up front so callout_drain is valid on every path. */
	+	callout_init(&stop_co, 1);
	+
	+	error = 0;
	+	nstarted = 0;
	+	for (i = 0; i < SUBCH_NTHREADS; i++) {
	+		bzero(&args[i], sizeof(args[i]));
	+		args[i].provider = provider;
	+		args[i].provider_name = "test_subch_storm";
	+		args[i].stop = &stop;
	+		error = kthread_add(test_subch_storm_thread, &args[i], NULL,
	+		    &threads[i], 0, 0, "subch_storm_%d", i);
	+		if (error != 0)
	+			break;
	+		nstarted++;
	+	}
	+
	+	if (error == 0) {
	+		callout_reset(&stop_co, hz * SUBCH_RUNTIME_S,
	+		    test_subch_stop_callout, &stop);
	+	} else {
	+		/* Spawn failed: tell the started threads to quit now. */
	+		atomic_store_rel_32((volatile uint32_t *)&stop, 1);
	+	}
	+
	+	/*
	+	 * Join every thread that was started. The timed sleep covers a
	+	 * wakeup that arrives between the load and the tsleep.
	+	 */
	+	for (i = 0; i < nstarted; i++) {
	+		while (atomic_load_acq_32(
	+		    (volatile uint32_t *)&args[i].exited) == 0)
	+			tsleep(&args[i].exited, 0, "subch_w", hz / 10);
	+	}
	+	callout_drain(&stop_co);
	+
	+	if (error != 0) {
	+		eventlog_provider_destroy(provider);
	+		/*
	+		 * Reports the failure; the explicit return below covers
	+		 * the case where the check macro does not return itself.
	+		 */
	+		KTEST_EQUAL(error, 0);
	+		return (error);
	+	}
	+
	+	total_iterations = 0;
	+	for (i = 0; i < SUBCH_NTHREADS; i++)
	+		total_iterations += args[i].iterations_done;
	+	KTEST_LOG(ctx, "total sub/unsub iterations: %d, n_true=%d n_false=%d",
	+	    total_iterations, c.n_true, c.n_false);
	+
	+	/*
	+	 * All subscribers are gone, so every 0->N edge (n_true) must
	+	 * have a matching N->0 edge (n_false). Without locking around
	+	 * the recount, races could produce unbalanced counts.
	+	 */
	+	KTEST_VERIFY(c.n_true > 0);
	+	KTEST_VERIFY(c.n_false > 0);
	+	KTEST_EQUAL(c.n_true, c.n_false);
	+	KTEST_EQUAL(c.last_state, 0);
	+
	+	eventlog_provider_destroy(provider);
	+	return (0);
	+#undef SUBCH_NTHREADS
	+#undef SUBCH_RUNTIME_S
	+}
	+
	+/*
	+ * NULL config must be equivalent to a zero-initialised struct: no
	+ * callbacks, default_enabled == 0.
	+ *
	+ * Creates one provider with a NULL config and one with a zeroed
	+ * config, and checks that both report default 0 and that a session
	+ * created on either starts disabled.
	+ */
	+KTEST_FUNC(provider_config_null_equivalent)
	+{
	+	/* Providers and sessions are handled by pointer throughout. */
	+	struct eventlog_provider *p_null, *p_zero;
	+	struct eventlog_session *s_null, *s_zero;
	+	struct eventlog_provider_config cfg_zero = { 0 };
	+
	+	KTEST_LOG(ctx, "NULL config behaves identically to {0}");
	+
	+	p_null = eventlog_provider_create("test_cfg_null", NULL);
	+	KTEST_NEQUAL(p_null, NULL);
	+	p_zero = eventlog_provider_create("test_cfg_zero", &cfg_zero);
	+	KTEST_NEQUAL(p_zero, NULL);
	+
	+	/* Both providers default to disabled (default_enabled == 0). */
	+	KTEST_EQUAL(eventlog_provider_get_default(p_null), 0);
	+	KTEST_EQUAL(eventlog_provider_get_default(p_zero), 0);
	+
	+	/* Sessions on either start disabled. */
	+	s_null = eventlog_session_create(p_null, 0, true, NULL, 0);
	+	KTEST_NEQUAL(s_null, NULL);
	+	KTEST_EQUAL(eventlog_session_is_enabled(s_null), 0);
	+	s_zero = eventlog_session_create(p_zero, 0, true, NULL, 0);
	+	KTEST_NEQUAL(s_zero, NULL);
	+	KTEST_EQUAL(eventlog_session_is_enabled(s_zero), 0);
	+
	+	eventlog_session_destroy(s_null);
	+	eventlog_session_destroy(s_zero);
	+	eventlog_provider_destroy(p_null);
	+	eventlog_provider_destroy(p_zero);
	+	return (0);
	+}
	+
	+/*
	+ * cfg.default_enabled = 1 must cause sessions to start enabled
	+ * without an explicit eventlog_session_set_enabled call. Same shape
	+ * with default_enabled = 0 must start disabled.
	+ */
	+KTEST_FUNC(provider_config_default_enabled)
	+{
	+	/* Providers and sessions are handled by pointer throughout. */
	+	struct eventlog_provider *p_on, *p_off;
	+	struct eventlog_session *s_on, *s_off;
	+	struct eventlog_provider_config cfg_on = { .default_enabled = 1 };
	+	struct eventlog_provider_config cfg_off = { .default_enabled = 0 };
	+
	+	KTEST_LOG(ctx, "cfg.default_enabled controls session start state");
	+
	+	p_on = eventlog_provider_create("test_cfg_def_on", &cfg_on);
	+	KTEST_NEQUAL(p_on, NULL);
	+	KTEST_EQUAL(eventlog_provider_get_default(p_on), 1);
	+
	+	p_off = eventlog_provider_create("test_cfg_def_off", &cfg_off);
	+	KTEST_NEQUAL(p_off, NULL);
	+	KTEST_EQUAL(eventlog_provider_get_default(p_off), 0);
	+
	+	/* Session state must follow the provider default at creation. */
	+	s_on = eventlog_session_create(p_on, 0, true, NULL, 0);
	+	KTEST_NEQUAL(s_on, NULL);
	+	KTEST_EQUAL(eventlog_session_is_enabled(s_on), 1);
	+
	+	s_off = eventlog_session_create(p_off, 0, true, NULL, 0);
	+	KTEST_NEQUAL(s_off, NULL);
	+	KTEST_EQUAL(eventlog_session_is_enabled(s_off), 0);
	+
	+	eventlog_session_destroy(s_on);
	+	eventlog_session_destroy(s_off);
	+	eventlog_provider_destroy(p_on);
	+	eventlog_provider_destroy(p_off);
	+	return (0);
	+}
	+
	+/*
	+ * Test table handed to KTEST_MODULE_DECLARE below; one KTEST_INFO entry
	+ * per KTEST_FUNC. NOTE(review): presumably a test that is not listed
	+ * here is never exposed by the module -- confirm when adding tests.
	+ */
	+static const struct ktest_test_info tests[] = {
	+ KTEST_INFO(provider_init_cleanup),
	+ KTEST_INFO(session_create_destroy),
	+ KTEST_INFO(event_logging_basic),
	+ KTEST_INFO(event_logging_multiple),
	+ KTEST_INFO(provider_independence),
	+ KTEST_INFO(event_data_integrity),
	+ KTEST_INFO(event_size_variations),
	+ KTEST_INFO(multithreaded_logging),
	+ KTEST_INFO(subscriber_create_destroy),
	+ KTEST_INFO(subscriber_create_device_invalid_size),
	+ KTEST_INFO(subscriber_add_subscription_nonexistent_provider),
	+ KTEST_INFO(subscriber_read_error_paths),
	+ KTEST_INFO(null_pointer_destroy),
	+ KTEST_INFO(subscriber_level_keyword_filtering),
	+ KTEST_INFO(event_oversized_dropped),
	+ KTEST_INFO(event_edge_cases_payload_session),
	+ KTEST_INFO(subscriber_subscription_update_in_place),
	+ KTEST_INFO(subscriber_multiple_subscribers),
	+ KTEST_INFO(subscriber_provider_enablement_aggregation),
	+ KTEST_INFO(subscriber_device_buffer),
	+ KTEST_INFO(subscriber_circular_buffer),
	+ KTEST_INFO(subscriber_double_buffer_race),
	+ KTEST_INFO(subscriber_mid_read_swap),
	+ KTEST_INFO(subscriber_buffer_boundary_stress),
	+ KTEST_INFO(subscriber_buffer_fill_to_capacity),
	+ KTEST_INFO(subscriber_rapid_swap_stress),
	+ KTEST_INFO(subscriber_callback),
	+ KTEST_INFO(schema_generated_macros),
	+ KTEST_INFO(schema_varlen_event),
	+ KTEST_INFO(event_write_gather),
	+ KTEST_INFO(lockfree_many_concurrent_writers),
	+ KTEST_INFO(lockfree_writer_swap_contention),
	+ KTEST_INFO(lockfree_buffer_full_contention),
	+ KTEST_INFO(lockfree_data_integrity_under_contention),
	+ KTEST_INFO(lockfree_reader_writer_swap_race),
	+ KTEST_INFO(timestamp_epoch_boundary),
	+ KTEST_INFO(timestamp_epoch_normal_delivery),
	+ KTEST_INFO(timestamp_epoch_small_uio),
	+ KTEST_INFO(dump_state_basic),
	+ KTEST_INFO(dump_state_routing),
	+ KTEST_INFO(dump_state_no_callback),
	+ KTEST_INFO(dump_state_curvnet_not_set),
	+ KTEST_INFO(dump_state_disabled_sessions),
	+ KTEST_INFO(dump_state_async_runs_off_caller_thread),
	+ KTEST_INFO(dump_state_async_subscribe_returns_before_dump),
	+ KTEST_INFO(dump_state_destroy_waits_for_dump),
	+ KTEST_INFO(dump_state_resubscribe_no_refire),
	+ KTEST_INFO(dump_state_emits_dump_complete),
	+ KTEST_INFO(multi_provider_subscribe_enables_all),
	+ KTEST_INFO(multi_provider_events_from_both),
	+ KTEST_INFO(multi_provider_destroy_one),
	+ KTEST_INFO(multi_provider_dump_state),
	+ KTEST_INFO(multi_provider_independent_enablement),
	+ KTEST_INFO(multi_provider_device_subscriber),
	+ KTEST_INFO(subscribers_changed_basic),
	+ KTEST_INFO(subscribers_changed_null_safe),
	+ KTEST_INFO(subscribers_changed_runs_unlocked),
	+ KTEST_INFO(subscribers_changed_concurrent_subunsub),
	+ KTEST_INFO(provider_config_null_equivalent),
	+ KTEST_INFO(provider_config_default_enabled),
	+};
	+
	+KTEST_MODULE_DECLARE(ktest_eventlog, tests);
	+
	diff --git a/sys/modules/ktest/Makefile b/sys/modules/ktest/Makefile
	--- a/sys/modules/ktest/Makefile
	+++ b/sys/modules/ktest/Makefile
	@@ -1,4 +1,5 @@
	SUBDIR= ktest \
	+ ktest_eventlog \
	ktest_example \
	ktest_netlink_message_writer \
	ktest_tcphpts
	diff --git a/sys/modules/ktest/ktest_eventlog/Makefile b/sys/modules/ktest/ktest_eventlog/Makefile
	new file mode 100644
	--- /dev/null
	+++ b/sys/modules/ktest/ktest_eventlog/Makefile
	@@ -0,0 +1,15 @@
	+# Kernel module carrying the eventlog ktest suite.
	+PACKAGE= tests
	+WARNS?= 6
	+
	+SYSDIR?=${SRCTOP}/sys
	+.include "${SYSDIR}/conf/kern.opts.mk"
	+
	+# Module sources are looked up in sys/kern.
	+.PATH: ${SYSDIR}/kern
	+
	+KMOD= ktest_eventlog
	+SRCS= kern_eventlog_test.c
	+
	+# NOTE(review): presumably picked up by the kmod build rules to generate
	+# the schema header used by the tests -- confirm.
	+EVENTLOG_SCHEMA= test_eventlog_schema.src
	+
	+.include <bsd.kmod.mk>
	+
	diff --git a/sys/sys/eventlog.h b/sys/sys/eventlog.h
	new file mode 100644
	--- /dev/null
	+++ b/sys/sys/eventlog.h
	@@ -0,0 +1,237 @@
	+/*
	+ * Copyright (c) 2026 Netflix, Inc.
	+ *
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ */
	+
	+#ifndef _SYS_EVENTLOG_H_
	+#define _SYS_EVENTLOG_H_
	+
	+#include <sys/types.h>
	+#include <sys/cdefs.h>
	+
	+/* Maximum provider name length */
	+#define EVENTLOG_PROVIDER_NAME_MAX 32
	+
	+/*
	+ * Keyword for session lifecycle events (reserved; provider schemas use
	+ * KEYWORD SESSION 1).
	+ */
	+#define EVENTLOG_KEYWORD_SESSION 0x80000000
	+
	+/*
	+ * Reserved event IDs (all providers). SESSION_CREATE / SESSION_END mark
	+ * each session's lifetime; DUMP_COMPLETE is synthesised once per
	+ * (subscriber, provider) at the end of an async dump_state replay,
	+ * with session_id == EVENTLOG_SESSION_ID_NONE.
	+ */
	+#define EVENTLOG_SESSION_END_ID ((uint32_t)-1) /* UINT32_MAX */
	+#define EVENTLOG_SESSION_CREATE_ID ((uint32_t)-2) /* UINT32_MAX - 1 */
	+#define EVENTLOG_DUMP_COMPLETE_ID ((uint32_t)-3) /* UINT32_MAX - 2 */
	+
	+/* Sentinel session_id for framework events not tied to a session. */
	+#define EVENTLOG_SESSION_ID_NONE ((uint64_t)-1) /* UINT64_MAX */
	+
	+/*
	+ * Event log levels. Values are spelled out; EVENTLOG_LEVEL_NONE is 0
	+ * and each following level is one higher.
	+ */
	+enum eventlog_level {
	+	EVENTLOG_LEVEL_NONE = 0,
	+	EVENTLOG_LEVEL_ERROR = 1,
	+	EVENTLOG_LEVEL_WARN = 2,
	+	EVENTLOG_LEVEL_INFO = 3,
	+	EVENTLOG_LEVEL_VERBOSE = 4,
	+	EVENTLOG_LEVEL_TRACE = 5
	+};
	+
	+#ifdef _KERNEL
	+
	+#include <sys/queue.h>
	+#include <sys/sysctl.h>
	+#include <vm/uma.h>
	+#include <sys/mutex.h>
	+#include <machine/atomic.h>
	+
	+/* Event log provider structure */
	+struct eventlog_provider;
	+
	+/* Session with exposed level/keywords for _ENABLED checks */
	+/*
	+ * NOTE(review): code built without EVENTLOG_INTERNAL sees only these two
	+ * fields; presumably they must match the leading members of the full
	+ * definition in kern_eventlog.c -- confirm.
	+ */
	+#ifndef EVENTLOG_INTERNAL
	+struct eventlog_session {
	+ enum eventlog_level effective_level; /* Cached for _ENABLED macro */
	+ uint32_t effective_keywords; /* Cached for _ENABLED macro */
	+};
	+#else
	+struct eventlog_session; /* Full definition in kern_eventlog.c */
	+#endif
	+
	+/*
	+ * Optional callback invoked when a subscriber subscribes with
	+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE. The provider should emit current
	+ * state for all its sessions using the normal event write APIs; the
	+ * framework routes those writes to the requesting subscriber only.
	+ *
	+ * Runs asynchronously on the eventlog dump taskqueue after the
	+ * subscribe call has returned. The taskqueue is single-threaded, so
	+ * concurrent invocations of the same callback never overlap. Callers
	+ * that need to observe the post-dump state can call
	+ * eventlog_subscriber_drain_dumps().
	+ *
	+ * provider: the provider whose state is being dumped.
	+ * arg: opaque pointer; presumably the dump_callback_arg registered in
	+ * struct eventlog_provider_config.
	+ */
	+typedef void (*eventlog_provider_dump_state_t)(
	+    struct eventlog_provider *provider, void *arg);
	+
	+/*
	+ * Optional callback invoked when the provider's default_enabled sysctl changes.
	+ * value is the raw sysctl value: 0, 1, -1 (disable all then set 0),
	+ * or 2 (enable all then set 1).
	+ * When value is -1 or 2, the framework does NOT iterate sessions for this
	+ * provider; the callback is responsible for enabling/disabling sessions itself.
	+ * When value is 0 or 1, this is informational only (default changed).
	+ *
	+ * provider: the provider whose default changed.
	+ * arg: opaque pointer; presumably the default_changed_arg registered in
	+ * struct eventlog_provider_config.
	+ */
	+typedef void (*eventlog_default_changed_t)(
	+    struct eventlog_provider *provider, int value, void *arg);
	+
	+/*
	+ * Optional callback invoked when a provider transitions between "no
	+ * subscribers" and "at least one subscriber". has_subscribers is the
	+ * new state. Useful for gating expensive setup that only needs to run
	+ * while a consumer is listening. May sleep and take other locks; must
	+ * not re-enter the eventlog framework.
	+ *
	+ * provider: the provider whose subscriber state changed.
	+ * arg: opaque pointer; presumably the subscribers_changed_arg registered
	+ * in struct eventlog_provider_config.
	+ */
	+typedef void (*eventlog_subscribers_changed_t)(
	+    struct eventlog_provider *provider, bool has_subscribers, void *arg);
	+
	+/*
	+ * Optional configuration for eventlog_provider_create. NULL or a
	+ * zero-initialised struct yields no callbacks and disabled-by-default
	+ * sessions. default_enabled seeds kern.eventlog.<name>.default; an
	+ * explicit tunable still wins.
	+ *
	+ * Each callback is optional (NULL means none). NOTE(review): each
	+ * *_arg is presumably passed back as the matching callback's arg --
	+ * confirm in kern_eventlog.c.
	+ */
	+struct eventlog_provider_config {
	+ eventlog_provider_dump_state_t dump_callback; /* state replay on DUMP_STATE subscribe */
	+ void *dump_callback_arg;
	+ eventlog_default_changed_t default_changed; /* default_enabled sysctl changed */
	+ void *default_changed_arg;
	+ eventlog_subscribers_changed_t subscribers_changed; /* none <-> some subscribers */
	+ void *subscribers_changed_arg;
	+ int default_enabled; /* 0: sessions start disabled, 1: enabled */
	+};
	+
	+/*
	+ * Create and register a new eventlog provider.
	+ * name: Provider name string.
	+ * config: Optional; NULL is equivalent to a zero-initialised config.
	+ *
	+ * Returns a pointer to the new provider; callers check the result
	+ * against NULL and release it with eventlog_provider_destroy().
	+ */
	+struct eventlog_provider *eventlog_provider_create(const char *name,
	+    const struct eventlog_provider_config *config);
	+
	+/*
	+ * Unregister and destroy an eventlog provider.
	+ */
	+void eventlog_provider_destroy(struct eventlog_provider *provider);
	+
	+/*
	+ * Query provider level and keywords (for testing/debugging).
	+ */
	+enum eventlog_level eventlog_provider_get_level(
	+ struct eventlog_provider *provider);
	+uint32_t eventlog_provider_get_keywords(struct eventlog_provider *provider);
	+
	+/*
	+ * Query the provider's default_enabled setting (from
	+ * kern.eventlog.<name>.default). Returns 0 (sessions start disabled) or 1
	+ * (sessions start enabled).
	+ */
	+int eventlog_provider_get_default(struct eventlog_provider *provider);
	+
	+/*
	+ * Set the provider's default_enabled value programmatically. This does NOT
	+ * iterate existing sessions; only affects future session creates.
	+ */
	+void eventlog_provider_set_default(struct eventlog_provider *provider,
	+ int value);
	+
	+/*
	+ * Return the provider's auto-generated kern.eventlog.<name> sysctl node and
	+ * its context list. Providers may attach children (e.g. kern.eventlog.cpu.hz);
	+ * the framework owns the storage so children must not outlive the provider.
	+ */
	+struct sysctl_oid;
	+struct sysctl_ctx_list;
	+struct sysctl_oid *eventlog_provider_get_sysctl_node(
	+ struct eventlog_provider *provider);
	+struct sysctl_ctx_list *eventlog_provider_get_sysctl_ctx(
	+ struct eventlog_provider *provider);
	+
	+/*
	+ * Create a new eventlog session.
	+ * session_id: Unique identifier (e.g., inp_gencnt for TCP per-connection
	+ * sessions).
	+ * waitok: If true, use M_WAITOK for allocations; else M_NOWAIT.
	+ * create_payload: Optional provider-specific payload for SESSION_CREATE. If
	+ * NULL, uses default (created_at only). Otherwise must match provider's
	+ * SESSION_CREATE struct.
	+ * create_payload_size: Size of create_payload, or 0 if NULL.
	+ *
	+ * The session's initial enabled state is derived from the provider's
	+ * default_enabled sysctl (kern.eventlog.<name>.default). SESSION_CREATE is
	+ * only emitted when enabled.
	+ */
	+struct eventlog_session *eventlog_session_create(
	+ struct eventlog_provider *provider, uint64_t session_id, bool waitok,
	+ void *create_payload, size_t create_payload_size);
	+
	+/*
	+ * Destroy an eventlog session.
	+ */
	+void eventlog_session_destroy(struct eventlog_session *session);
	+
	+/*
	+ * Enable or disable a session. When disabled, effective_level is set to
	+ * EVENTLOG_LEVEL_NONE so the _ENABLED check fails. When enabled, effective
	+ * values are restored from provider (or session override).
	+ */
	+void eventlog_session_set_enabled(struct eventlog_session *session,
	+ int enabled);
	+
	+/*
	+ * Returns non-zero if session is enabled, 0 if disabled or NULL.
	+ */
	+int eventlog_session_is_enabled(struct eventlog_session *session);
	+
	+/*
	+ * Set per-session level/keywords override. When set, effective values use
	+ * this instead of provider. Use eventlog_session_set_enabled(s, true) after
	+ * to apply. Level NONE or keywords 0 disables the session.
	+ */
	+void eventlog_session_set_filter(struct eventlog_session *session,
	+ enum eventlog_level level, uint32_t keywords);
	+
	+/*
	+ * Write an event directly to all relevant subscribers.
	+ */
	+void eventlog_event_write(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords, void *buffer, size_t length);
	+
	+/*
	+ * Same as eventlog_event_write but use a pre-computed timestamp (microseconds
	+ * since boot). Use when the caller already queried time (e.g.
	+ * session->created_at for SESSION_CREATE).
	+ */
	+void eventlog_event_write_at(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords, void *buffer, size_t length,
	+ uint64_t timestamp_us);
	+
	+/*
	+ * Scatter/gather variants. The payload is the concatenation of iovcnt
	+ * iovec entries (zero-length entries and iovcnt == 0 legal). Avoids an
	+ * intermediate copy when the event has a variable-length tail.
	+ */
	+struct iovec;
	+void eventlog_event_write_gather(struct eventlog_session *session, uint32_t id,
	+ enum eventlog_level level, uint32_t keywords,
	+ const struct iovec *iov, int iovcnt);
	+void eventlog_event_write_gather_at(struct eventlog_session *session,
	+ uint32_t id, enum eventlog_level level, uint32_t keywords,
	+ const struct iovec *iov, int iovcnt, uint64_t timestamp_us);
	+
	+#endif /* _KERNEL */
	+
	+#endif /* _SYS_EVENTLOG_H_ */
	diff --git a/sys/sys/eventlog_subscriber.h b/sys/sys/eventlog_subscriber.h
	new file mode 100644
	--- /dev/null
	+++ b/sys/sys/eventlog_subscriber.h
	@@ -0,0 +1,199 @@
	+/*
	+ * Copyright (c) 2026 Netflix, Inc.
	+ *
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ */
	+
	+#ifndef _SYS_EVENTLOG_SUBSCRIBER_H_
	+#define _SYS_EVENTLOG_SUBSCRIBER_H_
	+
	+#include <sys/types.h>
	+#include <sys/cdefs.h>
	+#include <sys/eventlog.h>
	+#include <sys/ioccom.h>
	+
	+/* Event header structure (naturally aligned, 32 bytes) */
	+/*
	+ * Layout: four uint16_t, two uint64_t, one uint32_t and one lwpid_t.
	+ * NOTE(review): the 32-byte total assumes lwpid_t is 4 bytes -- confirm.
	+ */
	+struct eventlog_event_header {
	+ uint16_t event_length; /* Total size including this header */
	+ uint16_t cpu; /* CPU ID */
	+ uint16_t provider_id; /* Provider's unique ID */
	+ uint16_t RESERVED; /* Write to zero, do not read */
	+ uint64_t timestamp; /* Timestamp in microseconds */
	+ uint64_t session_id; /* Session ID */
	+ uint32_t event_id; /* Event ID */
	+ lwpid_t thread_id; /* Thread ID */
	+};
	+
	+/*
	+ * Kind of subscriber: one created by eventlog_subscriber_create_device()
	+ * or one created by eventlog_subscriber_create_callback().
	+ */
	+enum eventlog_subscriber_type {
	+	EVENTLOG_SUBSCRIBER_TYPE_DEVICE = 0,
	+	EVENTLOG_SUBSCRIBER_TYPE_CALLBACK = 1
	+};
	+
	+/*
	+ * Per-subscription flags. Unknown bits are rejected with EINVAL by
	+ * eventlog_subscriber_add_subscription() so new flags can be added
	+ * later without silent breakage on old kernels.
	+ *
	+ * EVENTLOG_SUBSCRIPTION_DUMP_STATE: opt in to a replay of the
	+ * provider's current state. The framework enqueues an asynchronous
	+ * dump task for each newly-subscribed provider with a dump_callback;
	+ * events flow on the normal delivery path. See
	+ * eventlog_subscriber_drain_dumps() to wait for completion.
	+ */
	+#define EVENTLOG_SUBSCRIPTION_DUMP_STATE 0x00000001
	+#define EVENTLOG_SUBSCRIPTION_FLAGS_VALID \
	+ (EVENTLOG_SUBSCRIPTION_DUMP_STATE)
	+
	+/* Subscription request structure (for ioctl) */
	+struct eventlog_subscription_req {
	+ enum eventlog_level level; /* requested level */
	+ uint32_t keywords; /* requested keyword mask */
	+ uint32_t flags; /* EVENTLOG_SUBSCRIPTION_* bits */
	+ char provider_name[EVENTLOG_PROVIDER_NAME_MAX]; /* provider to subscribe to */
	+};
	+
	+/* Per-CPU buffer size limits (30-bit commit_pos in packed_state) */
	+#define EVENTLOG_BUFFER_SIZE_MIN (64 * 1024) /* 64 KB */
	+#define EVENTLOG_BUFFER_SIZE_MAX ((1 << 30) - 1) /* ~1 GB */
	+
	+/* CREATE request: creates subscriber and subscribes to providers */
	+/*
	+ * Variable-sized: EVENTLOG_IOCTL_CREATE_SIZE(count) encodes the length
	+ * of the fixed part plus count subscription entries.
	+ */
	+struct eventlog_create_req {
	+ uint32_t buffer_size_per_cpu; /* Buffer size per CPU */
	+ uint32_t count; /* Number of subscriptions */
	+ /* Variable-length array of subscription requests. */
	+ struct eventlog_subscription_req subscriptions[];
	+};
	+
	+/* Stats structure for GET_STATS IOCTL */
	+/* Also filled in by eventlog_subscriber_get_stats() for in-kernel callers. */
	+struct eventlog_stats {
	+ uint64_t dropped_events; /* Events dropped due to buffer full */
	+};
	+
	+/*
	+ * Provider info for GET_PROVIDERS IOCTL - returns subscribed providers with
	+ * their ids.
	+ *
	+ * Both structures are __packed.
	+ */
	+#define EVENTLOG_MAX_PROVIDERS 32
	+struct eventlog_provider_info {
	+ uint16_t provider_id;
	+ char name[EVENTLOG_PROVIDER_NAME_MAX];
	+} __packed;
	+struct eventlog_get_providers_resp {
	+ uint32_t count; /* presumably the number of valid entries in providers[] -- confirm */
	+ struct eventlog_provider_info providers[EVENTLOG_MAX_PROVIDERS];
	+} __packed;
	+
	+/* IOCTL definitions */
	+#define EVENTLOG_IOC_MAGIC 'E'
	+#define EVENTLOG_IOCTL_CREATE_BASE \
	+ _IOW(EVENTLOG_IOC_MAGIC, 1, struct eventlog_create_req)
	+#define EVENTLOG_IOCTL_DESTROY _IO(EVENTLOG_IOC_MAGIC, 2)
	+#define EVENTLOG_IOCTL_GET_STATS \
	+ _IOR(EVENTLOG_IOC_MAGIC, 3, struct eventlog_stats)
	+#define EVENTLOG_IOCTL_GET_PROVIDERS \
	+ _IOR(EVENTLOG_IOC_MAGIC, 4, struct eventlog_get_providers_resp)
	+
	+#define EVENTLOG_IOCTL_CREATE_SIZE(count) \
	+ _IOC_NEWLEN(EVENTLOG_IOCTL_CREATE_BASE, \
	+ __builtin_offsetof(struct eventlog_create_req, subscriptions) + \
	+ (count) * sizeof(struct eventlog_subscription_req))
	+
	+#ifdef _KERNEL
	+
	+#include <sys/conf.h>
	+#include <sys/uio.h>
	+
	+/* Forward declarations */
	+struct eventlog_subscriber;
	+struct eventlog_subscription;
	+
	+/*
	+ * Create a new device-based subscriber with per-CPU buffers.
	+ * buffer_size_per_cpu: Size of buffer to allocate per CPU
	+ * (EVENTLOG_BUFFER_SIZE_MIN to EVENTLOG_BUFFER_SIZE_MAX).
	+ * The subscriber is automatically added to the global subscribers list.
	+ * Returns NULL on failure, subscriber pointer on success.
	+ */
	+struct eventlog_subscriber *eventlog_subscriber_create_device(
	+ uint32_t buffer_size_per_cpu);
	+
	+/*
	+ * Callback function type for callback-based subscribers.
	+ *
	+ * The payload is delivered as a scatter/gather iovec; iovcnt == 1 for
	+ * scalar writes and may be > 1 for variable-length events. Callbacks
	+ * that need a flat payload compact the iov themselves. The iov and
	+ * iov[*].iov_base pointers are only valid for the duration of the call.
	+ *
	+ * Parameters (in order):
	+ * - hdr: Pointer to the event header
	+ * - provider_name: Provider name string
	+ * - provider_name_len: Length of provider name (excluding null terminator)
	+ * - session_id: Session ID (uint64_t, displayed as decimal)
	+ * - iov, iovcnt: Payload segments. iovcnt == 0 means no payload.
	+ * - payload_size: Sum of iov[*].iov_len (redundant, provided for ease)
	+ * - callback_arg: User-provided callback argument
	+ */
	+typedef void (*eventlog_callback_t)(const struct eventlog_event_header *hdr,
	+    const char *provider_name, uint8_t provider_name_len, uint64_t session_id,
	+    const struct iovec *iov, int iovcnt, size_t payload_size,
	+    void *callback_arg);
	+
	+/*
	+ * Create a new callback-based subscriber.
	+ * callback: Function to call when events arrive.
	+ * callback_arg: Argument to pass to callback function.
	+ * The subscriber is automatically added to the global subscribers list.
	+ * Returns NULL on failure, subscriber pointer on success.
	+ */
	+struct eventlog_subscriber *eventlog_subscriber_create_callback(
	+ eventlog_callback_t callback, void *callback_arg);
	+
	+/*
	+ * Destroy a subscriber and update provider enablement.
	+ * Removes all subscriptions, drains any in-flight dump_state tasks,
	+ * and frees resources.
	+ */
	+void eventlog_subscriber_destroy(struct eventlog_subscriber *subscriber);
	+
	+/*
	+ * Add a subscription to a subscriber. flags is a bitmask of
	+ * EVENTLOG_SUBSCRIPTION_* values; unknown bits return EINVAL. Pass 0
	+ * for no flags. Returns 0 on success, error code on failure.
	+ */
	+int eventlog_subscriber_add_subscription(struct eventlog_subscriber *subscriber,
	+ const char *provider_name, enum eventlog_level level, uint32_t keywords,
	+ uint32_t flags);
	+
	+/*
	+ * Wait for every dump_state task this subscriber has outstanding
	+ * (queued or running) to finish. Safe to call from any sleepable
	+ * context.
	+ */
	+void eventlog_subscriber_drain_dumps(struct eventlog_subscriber *subscriber);
	+
	+/*
	+ * Read events from a device subscriber's buffer.
	+ * Handles both user-space (UIO_USERSPACE) and kernel (UIO_SYSSPACE) uio.
	+ *
	+ * Parameters:
	+ * - subscriber: The subscriber to read from
	+ * - uio: Scatter/gather I/O structure (must have uio_td set for user space)
	+ * - flags: Read flags (e.g. FNONBLOCK for non-blocking)
	+ *
	+ * Returns 0 on success, or an error code on failure.
	+ */
	+int eventlog_subscriber_read(struct eventlog_subscriber *subscriber,
	+ struct uio *uio, int flags);
	+
	+/*
	+ * Query subscriber statistics.
	+ * Fills stats with current values (e.g. dropped_events).
	+ */
	+void eventlog_subscriber_get_stats(struct eventlog_subscriber *subscriber,
	+ struct eventlog_stats *stats);
	+
	+#endif /* _KERNEL */
	+
	+#endif /* _SYS_EVENTLOG_SUBSCRIBER_H_ */
	diff --git a/targets/pseudo/userland/Makefile.depend b/targets/pseudo/userland/Makefile.depend
	--- a/targets/pseudo/userland/Makefile.depend
	+++ b/targets/pseudo/userland/Makefile.depend
	@@ -16,6 +16,7 @@
	bin/domainname \
	bin/echo \
	bin/ed \
	+ bin/elog \
	bin/expr \
	bin/freebsd-version \
	bin/getfacl \
	diff --git a/tests/sys/kern/Makefile b/tests/sys/kern/Makefile
	--- a/tests/sys/kern/Makefile
	+++ b/tests/sys/kern/Makefile
	@@ -146,6 +146,9 @@
	CFLAGS.subr_unit.c+= -Wno-missing-prototypes
	SRCS.subr_unit_test+= subr_unit.c

	+ATF_TESTS_PYTEST+= kern_eventlog_test.py
	+ATF_TESTS_PYTEST+= elog_test.py
	+
	WARNS?= 3

	TESTS_SUBDIRS+= acct
	diff --git a/tests/sys/kern/elog_test.py b/tests/sys/kern/elog_test.py
	new file mode 100644
	--- /dev/null
	+++ b/tests/sys/kern/elog_test.py
	@@ -0,0 +1,67 @@
	+#
	+# Copyright (c) 2026 Netflix, Inc.
	+#
	+# SPDX-License-Identifier: BSD-2-Clause
	+#
	+
	+"""ATF tests for the elog(1) userspace utility.
	+
	+Smoke tests for the elog binary CLI surface. These catch packaging
	+regressions (missing binary, broken option parser, broken capture-file
	+reader) without requiring /dev/eventlog or any provider to be present.
	+
	+End-to-end coverage of the device interface and individual providers
	+lives with the providers themselves; the framework's own kernel-side
	+tests are in kern_eventlog_test.py.
	+"""
	+
	+import subprocess
	+from pathlib import Path
	+
	+import pytest
	+from atf_python.utils import BaseTest
	+
	+ELOG = "/usr/bin/elog"
	+
	+
	+class TestElogCli(BaseTest):
	+ @pytest.mark.require_progs(["elog"])
	+ def test_help(self):
	+ # usage() in elog.c calls exit(1), so -h is expected to fail.
	+ for flag in ("-h", "--help"):
	+ r = subprocess.run(
	+ [ELOG, flag], capture_output=True, text=True)
	+ assert r.returncode == 1, f"elog {flag} returncode"
	+ assert "usage: elog" in r.stderr, f"elog {flag} stderr"
	+
	+ @pytest.mark.require_progs(["elog"])
	+ def test_no_args(self):
	+ r = subprocess.run([ELOG], capture_output=True, text=True)
	+ assert r.returncode == 1
	+ assert "no subscriptions specified" in r.stderr
	+
	+ @pytest.mark.require_progs(["elog"])
	+ def test_unknown_arg(self):
	+ r = subprocess.run(
	+ [ELOG, "--not-a-real-flag"], capture_output=True, text=True)
	+ assert r.returncode == 1
	+ assert "unknown argument" in r.stderr
	+
	+ @pytest.mark.require_progs(["elog"])
	+ def test_read_missing_file(self, tmp_path):
	+ target = tmp_path / "does-not-exist.elog"
	+ r = subprocess.run(
	+ [ELOG, "-r", str(target)], capture_output=True, text=True)
	+ assert r.returncode == 1
	+ assert "fopen" in r.stderr
	+
	+ @pytest.mark.require_progs(["elog"])
	+ def test_read_invalid_magic(self, tmp_path):
	+ # 64 zero bytes is enough to satisfy the initial header read but
	+ # fails the ELOG_BINARY_MAGIC check.
	+ bogus = tmp_path / "bogus.elog"
	+ bogus.write_bytes(b"\0" * 64)
	+ r = subprocess.run(
	+ [ELOG, "-r", str(bogus)], capture_output=True, text=True)
	+ assert r.returncode == 1
	+ assert "bad magic number" in r.stderr
	diff --git a/tests/sys/kern/kern_eventlog_test.py b/tests/sys/kern/kern_eventlog_test.py
	new file mode 100644
	--- /dev/null
	+++ b/tests/sys/kern/kern_eventlog_test.py
	@@ -0,0 +1,10 @@
	+#
	+# Copyright (c) 2026 Netflix, Inc.
	+#
	+# SPDX-License-Identifier: BSD-2-Clause
	+#
	+
	+from atf_python.ktest import BaseKernelTest
	+
	+class TestKernEventlog(BaseKernelTest):
	+ KTEST_MODULE_NAME = "ktest_eventlog"  # presumably the ktest module BaseKernelTest drives - confirm in atf_python.ktest
	diff --git a/usr.bin/Makefile b/usr.bin/Makefile
	--- a/usr.bin/Makefile
	+++ b/usr.bin/Makefile
	@@ -35,6 +35,7 @@
	du \
	elfctl \
	elfdump \
	+ elog \
	enigma \
	env \
	etdump \
	diff --git a/usr.bin/elog/Makefile b/usr.bin/elog/Makefile
	new file mode 100644
	--- /dev/null
	+++ b/usr.bin/elog/Makefile
	@@ -0,0 +1,63 @@
	+.include <src.opts.mk>
	+
	+PACKAGE=runtime
	+PROG= elog
	+MAN= elog.1
	+LIBADD= z
	+
	+# Schema directory location
	+EVENTLOG_SCHEMA_DIR= ${SRCTOP}/include/eventlog
	+
	+# Find all schema files (handle case where directory doesn't exist yet).
	+# Only the basenames are kept: the sed expression strips the directory part.
	+EVENTLOG_SCHEMAS!= if [ -d ${EVENTLOG_SCHEMA_DIR} ]; then find ${EVENTLOG_SCHEMA_DIR} -name '*_eventlog_schema.src' 2>/dev/null | sed 's|.*/||' | sort; fi
	+
	+# Output directory for generated consumer headers
	+# Use OBJTOP directly and append sys/include/eventlog
	+# Ensure OBJTOP is treated as absolute path (it should be, but be explicit)
	+EVENTLOG_HEADER_DIR= ${OBJTOP:tA}/sys/include/eventlog
	+EVENTLOG_CONSUMER_HEADER= eventlog_consumer.h
	+
	+# Generate consumer headers for each schema and master header
	+.if !empty(EVENTLOG_SCHEMAS)
	+EVENTLOG_CONSUMER_HEADERS_GEN= ${EVENTLOG_HEADER_DIR}/.consumer_headers_generated
	+CLEANFILES+= ${EVENTLOG_CONSUMER_HEADERS_GEN}
	+# Generate headers immediately when Makefile is parsed (for early availability).
	+# The "|| true" keeps a generator failure from aborting Makefile parsing; the
	+# explicit target below reruns the generator and reports errors normally.
	+.if !make(clean) && !make(cleandir) && !make(clobber)
	+_GEN_CONSUMER_HEADERS!= ${.CURDIR}/gen_eventlog_headers.sh \
	+	${EVENTLOG_SCHEMA_DIR} \
	+	${OBJTOP:tA}/sys/include/eventlog \
	+	${SRCTOP} \
	+	${SRCTOP}/include/eventlog/eventlog_gen.awk \
	+	${OBJTOP:tA}/sys/include/eventlog/${EVENTLOG_CONSUMER_HEADER} || true; \
	+	echo "consumer_headers_generated"
	+.endif
	+# Create Make targets as a safety net (immediate generation above should be sufficient)
	+EVENTLOG_SCHEMA_DEPS!= if [ -d ${EVENTLOG_SCHEMA_DIR} ]; then find ${EVENTLOG_SCHEMA_DIR} -name '*_eventlog_schema.src' -type f 2>/dev/null | sort; fi
	+${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}: ${SRCTOP}/include/eventlog/eventlog_gen.awk ${.CURDIR}/gen_eventlog_headers.sh ${EVENTLOG_SCHEMA_DEPS}
	+	${.CURDIR}/gen_eventlog_headers.sh \
	+	    ${EVENTLOG_SCHEMA_DIR} \
	+	    ${EVENTLOG_HEADER_DIR} \
	+	    ${SRCTOP} \
	+	    ${SRCTOP}/include/eventlog/eventlog_gen.awk \
	+	    ${.TARGET}
	+	@touch ${EVENTLOG_CONSUMER_HEADERS_GEN}
	+
	+CLEANFILES+= ${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}
	+# Remove the per-provider consumer headers; the provider name is taken from
	+# the PROVIDER line of each schema file.
	+clean-consumer-headers:
	+	@if [ -d ${EVENTLOG_HEADER_DIR} ]; then \
	+		for schema in ${EVENTLOG_SCHEMAS}; do \
	+			provider=$$(awk '/^PROVIDER/ {print tolower($$2); exit}' ${EVENTLOG_SCHEMA_DIR}/$$schema 2>/dev/null || true); \
	+			if [ -n "$$provider" ]; then \
	+				rm -f ${EVENTLOG_HEADER_DIR}/$${provider}_eventlog_consumer.h; \
	+			fi; \
	+		done; \
	+	fi
	+
	+CFLAGS+= -I${EVENTLOG_HEADER_DIR} -I${SRCTOP}/sys
	+
	+beforebuild: ${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}
	+elog.o: ${EVENTLOG_HEADER_DIR}/${EVENTLOG_CONSUMER_HEADER}
	+.endif
	+
	+.include <bsd.prog.mk>
	+
	diff --git a/usr.bin/elog/elog.1 b/usr.bin/elog/elog.1
	new file mode 100644
	--- /dev/null
	+++ b/usr.bin/elog/elog.1
	@@ -0,0 +1,293 @@
	+.\"
	+.\" Copyright (c) 2026 Netflix, Inc.
	+.\"
	+.\" SPDX-License-Identifier: BSD-2-Clause
	+.\"
	+.Dd April 27, 2026
	+.Dt ELOG 1
	+.Os
	+.Sh NAME
	+.Nm elog
	+.Nd subscribe to and read events from the eventlog device
	+.Sh SYNOPSIS
	+.Nm
	+.Op Fl b Ar size
	+.Op Fl d
	+.Op Fl D
	+.Op Fl e
	+.Op Fl f Ar type
	+.Op Fl n
	+.Op Fl p
	+.Op Fl s
	+.Op Fl t
	+.Op Fl -delta-time
	+.Op Fl -duration Ar seconds
	+.Op Fl o Ar file | Cm dir= Ns Ar path
	+.Op Fl c Ar provider Op Ar level Op Ar keywords
	+.Op Fl c Ar provider Op Ar level Op Ar keywords
	+.Op ...
	+.Op Fl h
	+.Nm
	+.Fl r Ar file
	+.Op Fl d
	+.Op Fl e
	+.Op Fl n
	+.Op Fl p
	+.Op Fl t
	+.Op Fl -delta-time
	+.Sh DESCRIPTION
	+The
	+.Nm
	+utility subscribes to events from one or more eventlog providers and displays
	+them in a formatted output on standard output or writes them to a file.
	+.Pp
	+The utility opens the single system-wide eventlog device at
	+.Pa /dev/eventlog
	+and sends subscription requests for the specified providers.
	+.Pp
	+The eventlog framework is host-global and is not exposed to jailed
	+processes.
	+The
	+.Nm
	+utility must be run from the host
	+.Pq Va prison0 ;
	+running it from inside a jail fails because
	+.Xr open 2
	+on
	+.Pa /dev/eventlog
	+returns
	+.Er EPERM .
	+.Pp
	+The options are as follows:
	+.Bl -tag -width indent
	+.It Fl c Ar provider Op Ar level Op Ar keywords
	+.It Fl -capture Ar provider Op Ar level Op Ar keywords
	+Subscribe to events from the specified provider.
	+.Bl -tag -width indent
	+.It Ar provider
	+The name of the eventlog provider.
	+This argument is required.
	+.It Ar level
	+Optional log level.
	+Can be specified as a case-insensitive string
	+(NONE, ERROR, WARN, INFO, VERBOSE, TRACE)
	+or as a numeric value (0\(en5).
	+Defaults to VERBOSE if not specified.
	+TRACE captures all events including internal algorithmic details.
	+VERBOSE excludes TRACE-level events.
	+.It Ar keywords
	+Optional keyword filter.
	+Can be specified as a hexadecimal mask (e.g., 0x3F) or as pipe-delimited
	+keyword names (e.g., CC|RX|TX).
	+Keyword names are provider-specific and case-insensitive.
	+Defaults to 0xFFFFFFFF (all keywords) if not specified.
	+.El
	+.It Fl b Ar size
	+.It Fl -buffer-size Ar size
	+Set per-CPU buffer size.
	+The size can be specified as bytes, or with K/M/G suffix for Kilobytes,
	+Megabytes, or Gigabytes.
	+Examples: 65536, 64K, 1M, 256K.
	+Valid range: 4KB to 1GB per CPU.
	+Default: 512KB.
	+.It Fl d
	+.It Fl -date
	+Show the full date in timestamps.
	+When enabled, timestamps are displayed in the format
	+.Li YYYY-MM-DD HH:MM:SS.uuuuuu
	+instead of the default
	+.Li HH:MM:SS.uuuuuu .
	+.It Fl D
	+.It Fl -dump-state
	+Request providers dump their current state when subscribing.
	+This causes providers to emit initial state events (e.g., connection parameters,
	+current congestion window) immediately upon subscription.
	+.It Fl e
	+.It Fl -event-name
	+Show the event name (e.g., IN, OUT, SESSION_CREATE) in square brackets
	+after the session ID in each output line.
	+.It Fl n
	+.It Fl -event-number
	+Print a serial event number at the beginning of each output line.
	+Events are numbered starting from 1.
	+.It Fl o Ar file
	+.It Fl -output Ar file
	+Write output to the specified file in binary format.
	+See
	+.Xr elog 5
	+for a description of the binary file format.
	+Uses buffered I/O for efficiency.
	+Standard output always uses formatted text.
	+.It Fl o Cm dir= Ns Ar path
	+Write one binary file per session under
	+.Ar path .
	+Each session gets its own file, named by session ID.
	+The directory is created if it does not exist.
	+.It Fl p
	+.It Fl -providers
	+Print all registered provider names to stderr at the beginning of output,
	+before any events are displayed.
	+.It Fl r Ar file
	+.It Fl -read-binary Ar file
	+Read a binary file created with
	+.Fl o
	+and convert it to formatted text output.
	+See
	+.Xr elog 5
	+for a description of the binary file format.
	+The binary file header includes total event count and dropped-event count.
	+If the filename ends in
	+.Pa .gz ,
	+the file is transparently decompressed using zlib.
	+This mode cannot be used with capture options.
	+.It Fl s
	+.It Fl -stats
	+Print detailed statistics on exit, including provider count, the number
	+of events received, and any dropped events.
	+Statistics are printed to stderr.
	+Without this flag, no statistics are printed.
	+.It Fl t
	+.It Fl -relative-time
	+Show time relative to the first event in the trace.
	+Each output line is prefixed with a relative timestamp in the format
	+.Li +seconds.microseconds
	+(e.g.,
	+.Li +1.234567 ) .
	+.It Fl -delta-time
	+Show time elapsed since the previous event.
	+Each output line is prefixed with a delta timestamp in the format
	+.Li d Ns seconds.microseconds
	+(e.g.,
	+.Li d0.000015 ) .
	+Can be combined with
	+.Fl t .
	+.It Fl -duration Ar seconds
	+Self-exit after
	+.Ar seconds
	+seconds.
	+.Nm
	+raises
	+.Dv SIGALRM
	+internally and takes the same cleanup path as
	+.Dv SIGINT
	+or
	+.Dv SIGTERM :
	+binary output is flushed, the file header is updated with final
	+event/drop counts, and per-session files are closed and renamed.
	+A value of 0 disables the timer (the default), in which case
	+.Nm
	+runs until a signal is received.
	+Intended for scripted captures
	+.Pq Xr oca.py 1 Cm get Cm capture , cron jobs, ...
	+that want a fixed recording window without relying on an external
	+.Xr kill 1 .
	+.It Fl h
	+.It Fl -help
	+Display usage information and exit.
	+.El
	+.Pp
	+Multiple provider subscriptions can be specified by using multiple
	+.Fl c
	+flags.
	+.Pp
	+The utility reads events continuously, blocking when no data is available.
	+Events are displayed with timestamp, CPU, thread ID, provider name, session ID,
	+and formatted event data.
	+When
	+.Fl n , Fl t ,
	+or
	+.Fl -delta-time
	+are enabled, their respective prefixes appear at the beginning of each line
	+before the CPU and thread fields.
	+.Pp
	+Statistics are only printed when the
	+.Fl s
	+flag is specified.
	+When enabled, statistics include the number of providers, events received,
	+and any dropped events.
	+.Sh EXIT STATUS
	+.Ex -std
	+.Sh EXAMPLES
	+Subscribe to all events from a provider at VERBOSE level with all keywords:
	+.Bd -literal -offset indent
	+elog -c provider
	+.Ed
	+.Pp
	+Subscribe to a provider at INFO level with all keywords:
	+.Bd -literal -offset indent
	+elog -c provider INFO
	+.Ed
	+.Pp
	+Subscribe to a provider at INFO level with specific keywords by name
	+(the list is quoted so that the shell does not interpret the pipe):
	+.Bd -literal -offset indent
	+elog -c provider INFO 'KEYWORD1|KEYWORD2'
	+.Ed
	+.Pp
	+Subscribe to a provider at INFO level with keywords as hex mask:
	+.Bd -literal -offset indent
	+elog -c provider INFO 0x3F
	+.Ed
	+.Pp
	+Subscribe to a provider at TRACE level:
	+.Bd -literal -offset indent
	+elog -c provider TRACE
	+.Ed
	+.Pp
	+Subscribe to multiple providers:
	+.Bd -literal -offset indent
	+elog -c provider1 INFO -c provider2 WARN
	+.Ed
	+.Pp
	+Subscribe using numeric levels:
	+.Bd -literal -offset indent
	+elog -c provider 3 0x3F
	+.Ed
	+.Pp
	+Set buffer size to 4MB per CPU:
	+.Bd -literal -offset indent
	+elog -b 4M -c provider
	+.Ed
	+.Pp
	+Write output to a file (binary format):
	+.Bd -literal -offset indent
	+elog -c provider -o /tmp/events.bin
	+.Ed
	+.Pp
	+Read binary file and convert to text:
	+.Bd -literal -offset indent
	+elog -r /tmp/events.bin
	+.Ed
	+.Pp
	+Show relative timestamps from trace start:
	+.Bd -literal -offset indent
	+elog -t -c provider
	+.Ed
	+.Pp
	+Show both relative and inter-event delta timestamps:
	+.Bd -literal -offset indent
	+elog -t --delta-time -c provider
	+.Ed
	+.Pp
	+Write per-session files into a directory:
	+.Bd -literal -offset indent
	+elog -o dir=/tmp/traces -c provider
	+.Ed
	+.Pp
	+Print statistics on exit:
	+.Bd -literal -offset indent
	+elog -s -c provider
	+.Ed
	+.Pp
	+Run a fixed-duration capture, suitable for scripted use:
	+.Bd -literal -offset indent
	+elog --duration 30 -c provider1 -c provider2 -o /tmp/cap.elog
	+.Ed
	+.Sh SEE ALSO
	+.Xr elog 5 ,
	+.Xr eventlog 9
	+.Sh HISTORY
	+The
	+.Nm
	+utility first appeared in
	+.Fx 16.0 .
	+
	diff --git a/usr.bin/elog/elog.c b/usr.bin/elog/elog.c
	new file mode 100644
	--- /dev/null
	+++ b/usr.bin/elog/elog.c
	@@ -0,0 +1,1278 @@
	+/*
	+ * Copyright (c) 2026 Netflix, Inc.
	+ *
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ */
	+
	+#include <sys/param.h>
	+#include <sys/types.h>
	+#include <sys/eventlog.h>
	+#include <sys/eventlog_subscriber.h>
	+#include <sys/queue.h>
	+#include <sys/stat.h>
	+#include <sys/ioctl.h>
	+#include <sys/ioccom.h>
	+#include <fcntl.h>
	+#include <limits.h>
	+#include <stdio.h>
	+#include <stdlib.h>
	+#include <unistd.h>
	+#include <string.h>
	+#include <err.h>
	+#include <errno.h>
	+#include <stdint.h>
	+#include <stdbool.h>
	+#include <sys/time.h>
	+#include <time.h>
	+#include <signal.h>
	+#include <zlib.h>
	+
	+/* Include consumer header for formatting events */
	+/* Generated headers are in the build directory */
	+#include "eventlog_consumer.h"
	+
	+struct subscription { /* one requested provider subscription */
	+ char provider_name[EVENTLOG_PROVIDER_NAME_MAX]; /* provider to subscribe to */
	+ enum eventlog_level level; /* requested level for that provider */
	+ uint32_t keywords; /* requested keyword bitmask */
	+};
	+
	+static struct subscription *subscriptions = NULL;
	+static int subscription_count = 0;
	+static int subscription_capacity = 0; /* NOTE(review): presumably allocated slots; growth code is outside this chunk */
	+static uint32_t buffer_size_per_cpu = 512 * 1024; /* Default 512K */
	+/* When > 0, exit cleanly via SIGALRM after this many seconds. */
	+static unsigned int duration_sec = 0;
	+static volatile bool done = false; /* NOTE(review): presumably set from the signal handlers; not visible here */
	+static int eventlog_fd = -1; /* eventlog device fd (for stats) */
	+static volatile bool stats_printed = false;
	+static uint64_t events_received = 0; /* Used for -n numbering and the single-file header count */
	+static bool verbose_stats = false; /* Print detailed stats on exit */
	+static const char *binary_input_file = NULL; /* NOTE(review): presumably the -r argument; set outside this chunk */
	+static uint64_t last_dropped_events = 0; /* From last GET_STATS */
	+
	+/* For read mode: base timestamps from file header for UTC calculation */
	+static uint64_t read_capture_start = 0;
	+static uint64_t read_start_utc_us = 0;
	+static bool show_date = false; /* Full date in timestamps */
	+static bool show_event_number = false; /* Print serial number per line */
	+static bool show_providers = false; /* Print provider names at start */
	+static bool show_event_name = false; /* Print event name after sid */
	+static bool show_relative_time = false; /* Time relative to first event */
	+static bool show_delta_time = false; /* Time since previous event */
	+static uint64_t first_event_ts = 0; /* Timestamp of first printed event; 0 = none yet */
	+static uint64_t prev_event_ts = 0; /* Timestamp of previous printed event; 0 = none yet */
	+static bool dump_state = false; /* Replay current state on subscribe */
	+static char *output_dir = NULL; /* If set, one file per session */
	+
	+/* Per-session file state for -o dir= mode (also reused, unlinked, as
	+ * single_output for single-file mode). */
	+struct session_file {
	+ STAILQ_ENTRY(session_file) link; /* linkage on session_files */
	+ char *session_id; /* lookup key, heap copy owned by this entry */
	+ char *filepath; /* path of the output file, heap copy */
	+ FILE *fp; /* open output stream */
	+ bool header_written; /* file header is emitted lazily on the first event */
	+ uint64_t capture_start; /* timestamp of the first event written */
	+ uint64_t start_utc_us; /* wall-clock time (UTC us) when the header was written */
	+ uint64_t event_count; /* events written to this file so far */
	+};
	+static STAILQ_HEAD(, session_file) session_files =
	+ STAILQ_HEAD_INITIALIZER(session_files);
	+/* Binary output state for single-file mode. */
	+static struct session_file single_output;
	+
	+/*
	+ * True when a per-session output directory is configured or the single
	+ * output file is open; print_eventlog_event() then writes binary records
	+ * instead of formatted text.
	+ */
	+static inline bool
	+binary_output_mode(void)
	+{
	+	return (output_dir != NULL || single_output.fp != NULL);
	+}
	+
	+/* Provider id->name map (from GET_PROVIDERS or file header) */
	+static struct eventlog_provider_info provider_map[EVENTLOG_MAX_PROVIDERS];
	+static uint32_t provider_map_count = 0;
	+
	+/*
	+ * When show_providers is set, list the names held in provider_map on stderr.
	+ * Prints nothing if the flag is off or the map is empty.
	+ */
	+static void
	+print_provider_names(void)
	+{
	+	if (!show_providers || provider_map_count == 0)
	+		return;
	+	fprintf(stderr, "[Providers] %u registered:", provider_map_count);
	+	for (uint32_t i = 0; i < provider_map_count; i++)
	+		fprintf(stderr, " %s", provider_map[i].name);
	+	fprintf(stderr, "\n");
	+}
	+
	+/*
	+ * Translate a provider id into its name via provider_map.
	+ * Returns "?" when the id is not in the map.
	+ */
	+static const char *
	+get_provider_name(uint16_t id)
	+{
	+	const struct eventlog_provider_info *entry = provider_map;
	+	const struct eventlog_provider_info *end =
	+	    provider_map + provider_map_count;
	+
	+	for (; entry != end; entry++) {
	+		if (entry->provider_id == id)
	+			return (entry->name);
	+	}
	+	return ("?");
	+}
	+
	+/* Binary file format structures */
	+#define ELOG_BINARY_MAGIC "ELOG"
	+#define ELOG_BINARY_VERSION 1
	+
	+struct elog_binary_header {
	+ char magic[4]; /* "ELOG" (no terminating NUL stored) */
	+ uint32_t version; /* File format version */
	+ uint64_t capture_start; /* us since boot at capture start */
	+ uint64_t start_utc_us; /* UTC us at capture start */
	+ uint64_t event_count; /* Total events in file */
	+ uint64_t dropped_events; /* Dropped-event count; written as 0 for per-session files */
	+} __packed; /* 40 bytes, no padding */
	+
	+/*
	+ * Print the option summary to stderr and exit with status 1.
	+ * The buffer-size limits in the text come from EVENTLOG_BUFFER_SIZE_MIN/MAX.
	+ */
	+static void
	+usage(void)
	+{
	+	fprintf(stderr,
	+"usage: elog [options]\n"
	+" -c, --capture <provider> [level] [keywords]\n"
	+" Capture events from provider\n"
	+" provider: Provider name\n"
	+" level: NONE/0, ERROR/1, WARN/2, INFO/3,\n"
	+" VERBOSE/4, TRACE/5 (default: VERBOSE)\n"
	+" keywords: Hex (0x3F) or names (CC|RX|TX)\n"
	+" (default: 0xFFFFFFFF, all flags)\n"
	+" -b, --buffer-size <size> Set per-CPU buffer size (default: 512K)\n"
	+" Size in bytes or with K/M/G suffix\n"
	+" Valid range: %uKB to %uMB per CPU\n"
	+" --duration <sec> Self-exit after <sec> seconds (SIGALRM).\n"
	+" Same cleanup as SIGINT/SIGTERM. 0 = no timeout.\n"
	+" -d, --date Show full date (YYYY-MM-DD) in timestamps\n"
	+" -e, --event-name Show event name after session ID\n"
	+" -n, --event-number Print event serial number per line\n"
	+" -p, --providers Print provider names at start of output\n"
	+" -s, --stats Print detailed statistics on exit\n"
	+" -o, --output <file> Write binary output to file (default: stdout)\n"
	+" -o dir=<path> Write one binary file per session under <path>\n"
	+" -r, --read-binary <file>\n"
	+" Read binary file and convert to text (.gz ok)\n"
	+" -t, --relative-time Show time relative to first event\n"
	+" --delta-time Show time since previous event\n"
	+" -D, --dump-state Request providers to replay current state\n"
	+"\n"
	+" Multiple captures can be specified:\n"
	+" elog -c provider\n"
	+" elog -c provider INFO\n"
	+" elog -c provider INFO 0x3F\n"
	+" elog -c provider1 -c provider2 WARN\n"
	+" elog -c provider -o /tmp/events.bin\n"
	+" elog -r /tmp/events.bin\n",
	+	    EVENTLOG_BUFFER_SIZE_MIN / 1024,
	+	    EVENTLOG_BUFFER_SIZE_MAX / (1024 * 1024));
	+	exit(1);
	+}
	+
	+/*
	+ * Parse a log level given either as a case-insensitive name (NONE, ERROR,
	+ * WARN, INFO, VERBOSE, TRACE) or as a decimal number in the range
	+ * EVENTLOG_LEVEL_NONE..EVENTLOG_LEVEL_TRACE.
	+ *
	+ * Returns true and stores the level in *out on success; returns false and
	+ * leaves *out untouched if str is not a level.
	+ */
	+static bool
	+try_parse_level(const char *str, enum eventlog_level *out)
	+{
	+	static const struct {
	+		const char *name;
	+		enum eventlog_level level;
	+	} levels[] = {
	+		{ "NONE", EVENTLOG_LEVEL_NONE },
	+		{ "ERROR", EVENTLOG_LEVEL_ERROR },
	+		{ "WARN", EVENTLOG_LEVEL_WARN },
	+		{ "INFO", EVENTLOG_LEVEL_INFO },
	+		{ "VERBOSE", EVENTLOG_LEVEL_VERBOSE },
	+		{ "TRACE", EVENTLOG_LEVEL_TRACE },
	+	};
	+	char *endptr;
	+	long num;
	+
	+	for (size_t i = 0; i < nitems(levels); i++) {
	+		if (strcasecmp(str, levels[i].name) == 0) {
	+			*out = levels[i].level;
	+			return (true);
	+		}
	+	}
	+
	+	/*
	+	 * Require at least one digit: for "" strtol() consumes nothing and
	+	 * returns 0, which would otherwise be accepted as NONE.
	+	 */
	+	num = strtol(str, &endptr, 10);
	+	if (endptr != str && *endptr == '\0' &&
	+	    num >= EVENTLOG_LEVEL_NONE && num <= EVENTLOG_LEVEL_TRACE) {
	+		*out = (enum eventlog_level)num;
	+		return (true);
	+	}
	+	return (false);
	+}
	+
	+/*
	+ * Parse a keyword filter for 'provider'. Two forms are accepted:
	+ *  - a hexadecimal mask starting with 0x/0X that fits in 32 bits;
	+ *  - keyword names separated by '|', each resolved through
	+ *    eventlog_keyword_from_string(); a name that resolves to 0 is
	+ *    treated as unknown.
	+ *
	+ * Returns true and stores the mask in *out on success; returns false and
	+ * leaves *out untouched on malformed input or allocation failure.
	+ */
	+static bool
	+try_parse_keywords(const char *provider, const char *str, uint32_t *out)
	+{
	+	char *copy, *token, *saveptr, *endptr;
	+	unsigned long mask;
	+	uint32_t result, kw;
	+
	+	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) {
	+		/* Reject "0x", trailing junk and masks wider than 32 bits. */
	+		errno = 0;
	+		mask = strtoul(str, &endptr, 16);
	+		if (str[2] == '\0' || *endptr != '\0' || errno == ERANGE ||
	+		    mask > UINT32_MAX)
	+			return (false);
	+		*out = (uint32_t)mask;
	+		return (true);
	+	}
	+
	+	/* strtok_r() modifies its input, so work on a private copy. */
	+	copy = strdup(str);
	+	if (copy == NULL)
	+		return (false);
	+
	+	result = 0;
	+	token = strtok_r(copy, "|", &saveptr);
	+	while (token != NULL) {
	+		kw = eventlog_keyword_from_string(provider, token);
	+		if (kw == 0) {
	+			free(copy);
	+			return (false);
	+		}
	+		result |= kw;
	+		token = strtok_r(NULL, "|", &saveptr);
	+	}
	+	free(copy);
	+
	+	/* No tokens at all (e.g. "" or "|"). */
	+	if (result == 0)
	+		return (false);
	+
	+	*out = result;
	+	return (true);
	+}
	+
	+/*
	+ * Parse a buffer size: a number (any base strtoull() accepts with base 0),
	+ * optionally followed by a K, M or G suffix (case-insensitive). Blanks are
	+ * allowed before and after the suffix.
	+ *
	+ * Returns the size in bytes. Exits via errx() if the string is malformed or
	+ * the result is outside EVENTLOG_BUFFER_SIZE_MIN..EVENTLOG_BUFFER_SIZE_MAX.
	+ */
	+static size_t
	+parse_size(const char *size_str)
	+{
	+	char *endptr;
	+	unsigned long long size, mult = 1;
	+	char unit;
	+
	+	size = strtoull(size_str, &endptr, 0);
	+	if (endptr == size_str)
	+		errx(1, "invalid buffer size: %s", size_str);
	+
	+	/* Skip whitespace */
	+	while (*endptr == ' ' || *endptr == '\t')
	+		endptr++;
	+
	+	/* Check for unit suffix */
	+	unit = *endptr;
	+	if (unit != '\0') {
	+		endptr++; /* Skip the unit character */
	+		/* Check for any remaining characters */
	+		while (*endptr == ' ' || *endptr == '\t')
	+			endptr++;
	+		if (*endptr != '\0')
	+			errx(1,
	+			    "invalid buffer size: trailing characters after unit");
	+
	+		switch (unit) {
	+		case 'K':
	+		case 'k':
	+			mult = 1024ULL;
	+			break;
	+		case 'M':
	+		case 'm':
	+			mult = 1024ULL * 1024;
	+			break;
	+		case 'G':
	+		case 'g':
	+			mult = 1024ULL * 1024 * 1024;
	+			break;
	+		default:
	+			errx(1, "invalid buffer size unit: %c (use K, M, or G)",
	+			    unit);
	+		}
	+
	+		/*
	+		 * Refuse values that would wrap when scaled; a wrapped
	+		 * product could otherwise slip through the range check.
	+		 */
	+		if (size > ULLONG_MAX / mult)
	+			errx(1, "buffer size too large: maximum is %u bytes",
	+			    EVENTLOG_BUFFER_SIZE_MAX);
	+		size *= mult;
	+	}
	+
	+	if (size < EVENTLOG_BUFFER_SIZE_MIN)
	+		errx(1, "buffer size too small: minimum is %u bytes",
	+		    EVENTLOG_BUFFER_SIZE_MIN);
	+	if (size > EVENTLOG_BUFFER_SIZE_MAX)
	+		errx(1, "buffer size too large: maximum is %u bytes",
	+		    EVENTLOG_BUFFER_SIZE_MAX);
	+
	+	return ((size_t)size);
	+}
	+
	+/*
	+ * Format the event timestamp 'us' (microseconds) into buf.
	+ *
	+ * When base_utc_us is non-zero (e.g. taken from a capture file header) the
	+ * time is converted to UTC as base_utc_us + (us - base_ts). With show_date
	+ * it is formatted as YYYY-MM-DD HH:MM:SS.uuuuuu; otherwise just
	+ * HH:MM:SS.uuuuuu. When base_utc_us is 0, or gmtime() fails, 'us' itself is
	+ * printed as HH:MM:SS.uuuuuu (hours are not wrapped at 24).
	+ */
	+static void
	+format_timestamp(uint64_t us, char *buf, size_t bufsize,
	+ uint64_t base_ts, uint64_t base_utc_us)
	+{
	+ if (base_utc_us != 0) {
	+ int64_t delta = (int64_t)us - (int64_t)base_ts;
	+ uint64_t utc_us = (uint64_t)((int64_t)base_utc_us + delta);
	+ time_t sec = (time_t)(utc_us / 1000000);
	+ unsigned long usec = (unsigned long)(utc_us % 1000000);
	+ struct tm *tm = gmtime(&sec);
	+ if (tm != NULL) {
	+ if (show_date)
	+ snprintf(buf, bufsize,
	+ "%04d-%02d-%02d %02d:%02d:%02d.%06lu",
	+ tm->tm_year + 1900, tm->tm_mon + 1,
	+ tm->tm_mday, tm->tm_hour, tm->tm_min,
	+ tm->tm_sec, usec);
	+ else
	+ snprintf(buf, bufsize, "%02d:%02d:%02d.%06lu",
	+ tm->tm_hour, tm->tm_min, tm->tm_sec,
	+ usec);
	+ return;
	+ }
	+ }
	+ /* Fallback: uptime format (also reached when gmtime() returned NULL) */
	+ {
	+ uint64_t seconds = us / 1000000;
	+ uint64_t microseconds = us % 1000000;
	+ uint64_t hours = seconds / 3600;
	+ uint64_t minutes = (seconds % 3600) / 60;
	+ uint64_t secs = seconds % 60;
	+ snprintf(buf, bufsize, "%02llu:%02llu:%02llu.%06llu",
	+ (unsigned long long)hours,
	+ (unsigned long long)minutes,
	+ (unsigned long long)secs,
	+ (unsigned long long)microseconds);
	+ }
	+}
	+
	+/* Forward declarations */
	+static void write_binary_header_to_file(FILE *fp, uint64_t capture_start_time,
	+ uint64_t start_utc_time_us);
	+static size_t parse_and_print_events(const unsigned char *data, size_t len);
	+
	+/*
	+ * Get or create the output file for a session when using -o dir= mode.
	+ * Returns NULL if output_dir is not set (single-file mode).
	+ *
	+ * Existing entries are looked up by the raw session id. A new file is named
	+ * <output_dir>/<sanitized id>.elog, where the sanitized id keeps
	+ * [A-Za-z0-9_-], maps '/' and backslash to '_', drops every other character
	+ * and falls back to "global" if nothing is left. Any failure (allocation,
	+ * over-long path, fopen) is fatal.
	+ */
	+#define SESSION_ID_STR_MAX 32
	+
	+static FILE *
	+get_session_output_file(const char *session_id)
	+{
	+	struct session_file *sf;
	+	char sanitized[SESSION_ID_STR_MAX];
	+	char fullpath[PATH_MAX];
	+	size_t i, j;
	+
	+	if (output_dir == NULL)
	+		return (NULL);
	+
	+	STAILQ_FOREACH(sf, &session_files, link) {
	+		if (strcmp(sf->session_id, session_id) == 0)
	+			return (sf->fp);
	+	}
	+
	+	for (i = 0, j = 0;
	+	    session_id[i] != '\0' && j < (sizeof(sanitized) - 1); i++) {
	+		char c = session_id[i];
	+
	+		if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
	+		    (c >= '0' && c <= '9') || c == '-' || c == '_')
	+			sanitized[j++] = c;
	+		else if (c == '/' || c == '\\')
	+			sanitized[j++] = '_';
	+	}
	+	sanitized[j] = '\0';
	+	if (j == 0)
	+		snprintf(sanitized, sizeof(sanitized), "global");
	+
	+	sf = malloc(sizeof(*sf));
	+	if (sf == NULL)
	+		err(1, "malloc(session_file)");
	+	sf->session_id = strdup(session_id);
	+	if (sf->session_id == NULL)
	+		err(1, "strdup");
	+	if (snprintf(fullpath, sizeof(fullpath), "%s/%s.elog", output_dir,
	+	    sanitized) >= (int)sizeof(fullpath))
	+		errx(1, "path too long");
	+	sf->filepath = strdup(fullpath);
	+	if (sf->filepath == NULL)
	+		err(1, "strdup");
	+	sf->fp = fopen(fullpath, "wb");
	+	if (sf->fp == NULL)
	+		err(1, "fopen(%s)", fullpath);
	+	/* malloc() does not zero memory: initialise every remaining field. */
	+	sf->header_written = false;
	+	sf->capture_start = 0;
	+	sf->start_utc_us = 0;
	+	sf->event_count = 0;
	+	STAILQ_INSERT_TAIL(&session_files, sf, link);
	+	return (sf->fp);
	+}
	+
	+/*
	+ * Look up the session_files entry whose session_id equals the given string.
	+ * Returns the entry, or NULL if no such session file exists.
	+ */
	+static struct session_file *
	+find_session_file(const char *session_id)
	+{
	+	struct session_file *cur = STAILQ_FIRST(&session_files);
	+
	+	while (cur != NULL && strcmp(cur->session_id, session_id) != 0)
	+		cur = STAILQ_NEXT(cur, link);
	+	return (cur);
	+}
	+
	+/*
	+ * Fill in every field of a binary file header: the magic and format version
	+ * are fixed, the remaining values are taken from the arguments.
	+ */
	+static void
	+init_binary_header(struct elog_binary_header *hdr, uint64_t capture_start,
	+    uint64_t start_utc_us, uint64_t event_count, uint64_t dropped_events)
	+{
	+	hdr->version = ELOG_BINARY_VERSION;
	+	hdr->capture_start = capture_start;
	+	hdr->start_utc_us = start_utc_us;
	+	hdr->event_count = event_count;
	+	hdr->dropped_events = dropped_events;
	+	/* Only the four magic bytes are stored, not the string's NUL. */
	+	memcpy(hdr->magic, ELOG_BINARY_MAGIC, sizeof(hdr->magic));
	+}
	+
	+/*
	+ * Overwrite the fixed header at the start of fp with the given values.
	+ * The stream position is left just past the header (it is not restored).
	+ * Seek or write failures are fatal.
	+ */
	+static void
	+rewrite_binary_header(FILE *fp, uint64_t capture_start,
	+    uint64_t start_utc_us, uint64_t event_count, uint64_t dropped_events)
	+{
	+	struct elog_binary_header fresh;
	+
	+	if (fseek(fp, 0, SEEK_SET) != 0)
	+		err(1, "fseek");
	+	init_binary_header(&fresh, capture_start, start_utc_us,
	+	    event_count, dropped_events);
	+	if (fwrite(&fresh, sizeof(fresh), 1, fp) != 1)
	+		err(1, "fwrite(binary header)");
	+}
	+
	+/*
	+ * Finish and forget the per-session file for session_id: patch the final
	+ * event count into its header (if one was written), close the stream and
	+ * free the list entry. Does nothing if the session has no file.
	+ */
	+static void
	+close_session_file(const char *session_id)
	+{
	+	struct session_file *sf;
	+
	+	STAILQ_FOREACH(sf, &session_files, link) {
	+		if (strcmp(sf->session_id, session_id) == 0)
	+			break;
	+	}
	+	if (sf == NULL)
	+		return;
	+
	+	if (sf->header_written)
	+		rewrite_binary_header(sf->fp, sf->capture_start,
	+		    sf->start_utc_us, sf->event_count, 0);
	+	/* fclose() flushes; report (but survive) a failed final write. */
	+	if (fclose(sf->fp) != 0)
	+		warn("fclose(%s)", sf->filepath);
	+	sf->fp = NULL;
	+	STAILQ_REMOVE(&session_files, sf, session_file, link);
	+	free(sf->filepath);
	+	free(sf->session_id);
	+	free(sf);
	+}
	+
	+/*
	+ * Write the start of a capture file: the fixed header with zero event and
	+ * drop counts, then the provider count and each provider_map entry.
	+ * Any short write is fatal.
	+ */
	+static void
	+write_binary_header_to_file(FILE *fp, uint64_t capture_start_time,
	+    uint64_t start_utc_time_us)
	+{
	+	struct elog_binary_header initial;
	+	const struct eventlog_provider_info *entry;
	+
	+	init_binary_header(&initial, capture_start_time, start_utc_time_us,
	+	    0, 0);
	+	if (fwrite(&initial, sizeof(initial), 1, fp) != 1)
	+		err(1, "fwrite(binary header)");
	+	if (fwrite(&provider_map_count, sizeof(provider_map_count), 1, fp) != 1)
	+		err(1, "fwrite(provider count)");
	+	for (entry = provider_map; entry < provider_map + provider_map_count;
	+	    entry++) {
	+		if (fwrite(entry, sizeof(*entry), 1, fp) != 1)
	+			err(1, "fwrite(provider)");
	+	}
	+}
	+
	+/*
	+ * Append one event (header followed by payload) to the binary capture.
	+ *
	+ * In output_dir mode the target file is selected by the event's session id
	+ * and created on demand; otherwise single_output is used. The file header
	+ * is written lazily, on the first event for that file, using the event's
	+ * timestamp and the current wall-clock time. In output_dir mode the session
	+ * file is closed once eventlog_is_session_end() reports true for the event.
	+ */
	+static void
	+write_binary_event(const struct eventlog_event_header *hdr,
	+ const void *payload, size_t payload_size)
	+{
	+ char session_id_str[SESSION_ID_STR_MAX];
	+ FILE *out_fp;
	+ struct session_file *sf;
	+ size_t event_length;
	+
	+ snprintf(session_id_str, sizeof(session_id_str), "%lu",
	+ (unsigned long)hdr->session_id);
	+
	+ if (output_dir != NULL) {
	+ out_fp = get_session_output_file(session_id_str);
	+ sf = find_session_file(session_id_str);
	+ } else {
	+ out_fp = single_output.fp;
	+ sf = &single_output;
	+ }
	+
	+ if (sf != NULL) {
	+ if (!sf->header_written) {
	+ struct timeval tv;
	+ gettimeofday(&tv, NULL);
	+ uint64_t utc_us = (uint64_t)tv.tv_sec * 1000000 +
	+ tv.tv_usec;
	+ write_binary_header_to_file(out_fp, hdr->timestamp,
	+ utc_us);
	+ sf->header_written = true;
	+ sf->capture_start = hdr->timestamp;
	+ sf->start_utc_us = utc_us;
	+ }
	+ sf->event_count++;
	+ }
	+
	+ event_length = sizeof(struct eventlog_event_header) + payload_size;
	+ if (event_length > UINT16_MAX) /* event_length is stored in 16 bits */
	+ errx(1, "Event too large for binary format: %zu bytes",
	+ event_length);
	+
	+ struct eventlog_event_header hdr_copy = *hdr;
	+ hdr_copy.event_length = (uint16_t)event_length;
	+ if (fwrite(&hdr_copy, sizeof(struct eventlog_event_header), 1, out_fp)
	+ != 1)
	+ err(1, "fwrite(event header)");
	+ if (payload_size > 0 && fwrite(payload, payload_size, 1, out_fp) != 1)
	+ err(1, "fwrite(payload)");
	+
	+ if (output_dir != NULL) {
	+ if (eventlog_is_session_end(NULL, hdr->event_id))
	+ close_session_file(session_id_str);
	+ }
	+}
	+
	+/*
	+ * Emit one event. In binary output mode the record is handed to
	+ * write_binary_event() unchanged. Otherwise one text line is written to
	+ * stdout: optional event number, relative time and delta time prefixes,
	+ * then "[cpu]tid::timestamp [provider][session]", the optional "[name]"
	+ * tag and the formatted payload.
	+ */
	+static void
	+print_eventlog_event(const struct eventlog_event_header *hdr,
	+    const void *payload, size_t payload_size)
	+{
	+	char line[2048];
	+	char payload_text[1024];
	+	char sid[SESSION_ID_STR_MAX];
	+	char when[32];
	+	char name_tag[64] = "";
	+	char seq_tag[32] = "";
	+	char rel_tag[32] = "";
	+	char delta_tag[32] = "";
	+	const char *provider;
	+
	+	if (binary_output_mode()) {
	+		write_binary_event(hdr, payload, payload_size);
	+		return;
	+	}
	+
	+	provider = get_provider_name(hdr->provider_id);
	+	snprintf(sid, sizeof(sid), "%lu", (unsigned long)hdr->session_id);
	+	format_timestamp(hdr->timestamp, when, sizeof(when),
	+	    read_capture_start, read_start_utc_us);
	+
	+	if (show_relative_time) {
	+		uint64_t since_first;
	+
	+		/* 0 doubles as the "no reference event yet" marker. */
	+		if (first_event_ts == 0)
	+			first_event_ts = hdr->timestamp;
	+		since_first = hdr->timestamp - first_event_ts;
	+		snprintf(rel_tag, sizeof(rel_tag), "+%llu.%06llu ",
	+		    (unsigned long long)(since_first / 1000000),
	+		    (unsigned long long)(since_first % 1000000));
	+	}
	+
	+	if (show_delta_time) {
	+		uint64_t since_prev = 0;
	+
	+		if (prev_event_ts != 0)
	+			since_prev = hdr->timestamp - prev_event_ts;
	+		snprintf(delta_tag, sizeof(delta_tag), "d%llu.%06llu ",
	+		    (unsigned long long)(since_prev / 1000000),
	+		    (unsigned long long)(since_prev % 1000000));
	+	}
	+	/* Tracked for every printed event, whether or not deltas are shown. */
	+	prev_event_ts = hdr->timestamp;
	+
	+	if (eventlog_format_payload(provider, payload, payload_size,
	+	    hdr->event_id, payload_text, sizeof(payload_text)) <= 0)
	+		snprintf(payload_text, sizeof(payload_text),
	+		    "[UNKNOWN_EVENT_ID:%u]", hdr->event_id);
	+
	+	if (show_event_number)
	+		snprintf(seq_tag, sizeof(seq_tag), "%-8llu ",
	+		    (unsigned long long)(events_received + 1));
	+
	+	if (show_event_name) {
	+		const char *name = eventlog_event_id_to_name(provider,
	+		    hdr->event_id);
	+
	+		if (name == NULL)
	+			snprintf(name_tag, sizeof(name_tag), "[?%u]",
	+			    hdr->event_id);
	+		else
	+			snprintf(name_tag, sizeof(name_tag), "[%s]", name);
	+	}
	+
	+	snprintf(line, sizeof(line),
	+	    "%s%s%s[%2u]%04x::%s [%s][%s]%s %s\n",
	+	    seq_tag, rel_tag, delta_tag,
	+	    hdr->cpu, (unsigned int)hdr->thread_id,
	+	    when, provider, sid, name_tag, payload_text);
	+
	+	fputs(line, stdout);
	+}
	+
	+/*
	+ * Patch final counts into the header of every open capture file.
	+ * Per-session files get their own event count and a drop count of 0; the
	+ * single output file gets events_received and last_dropped_events. Files
	+ * whose header was never written are left alone.
	+ */
	+static void
	+update_binary_header(void)
	+{
	+	struct session_file *sf;
	+
	+	if (output_dir == NULL) {
	+		if (single_output.header_written)
	+			rewrite_binary_header(single_output.fp,
	+			    single_output.capture_start,
	+			    single_output.start_utc_us, events_received,
	+			    last_dropped_events);
	+		return;
	+	}
	+
	+	STAILQ_FOREACH(sf, &session_files, link) {
	+		if (!sf->header_written)
	+			continue;
	+		rewrite_binary_header(sf->fp, sf->capture_start,
	+		    sf->start_utc_us, sf->event_count, 0);
	+	}
	+}
	+
	+/*
	+ * Report whether filename ends in ".gz" (case-sensitive).
	+ */
	+static bool
	+has_gz_extension(const char *filename)
	+{
	+	static const char suffix[] = ".gz";
	+	const size_t suffix_len = sizeof(suffix) - 1;
	+	size_t name_len = strlen(filename);
	+
	+	if (name_len < suffix_len)
	+		return (false);
	+	return (memcmp(filename + name_len - suffix_len, suffix,
	+	    suffix_len) == 0);
	+}
	+
	+/*
	+ * Thin wrappers to abstract FILE vs gzFile for the read path.
	+ * Only one of fp/gz is in use at a time, selected by is_gz.
	+ */
	+struct elog_reader {
	+ FILE *fp; /* stdio stream, used when !is_gz */
	+ gzFile gz; /* zlib stream, used when is_gz */
	+ bool is_gz; /* which of the two handles is live */
	+};
	+
	+/*
	+ * Open filename for reading into *r. Names ending in ".gz" are opened
	+ * through zlib, everything else through stdio; the unused handle is set to
	+ * NULL. Exits via err() if the file cannot be opened.
	+ */
	+static void
	+elog_reader_open(struct elog_reader *r, const char *filename)
	+{
	+	r->is_gz = has_gz_extension(filename);
	+	if (r->is_gz) {
	+		r->fp = NULL;
	+		r->gz = gzopen(filename, "rb");
	+		if (r->gz == NULL)
	+			err(1, "gzopen(%s)", filename);
	+	} else {
	+		r->gz = NULL;
	+		r->fp = fopen(filename, "rb");
	+		if (r->fp == NULL)
	+			err(1, "fopen(%s)", filename);
	+	}
	+}
	+
	+/*
	+ * Read up to len bytes into buf. Returns the number of bytes actually read,
	+ * which may be short or 0 at end of input. A zlib read error is fatal; for
	+ * plain files a stdio error is not distinguished from end of file here.
	+ */
	+static ssize_t
	+elog_reader_read(struct elog_reader *r, void *buf, size_t len)
	+{
	+	if (r->is_gz) {
	+		int ret = gzread(r->gz, buf, (unsigned)len);
	+
	+		if (ret < 0) {
	+			int errnum;
	+			const char *msg = gzerror(r->gz, &errnum);
	+
	+			errx(1, "gzread: %s", msg);
	+		}
	+		return ((ssize_t)ret);
	+	}
	+	return ((ssize_t)fread(buf, 1, len, r->fp));
	+}
	+
	+/*
	+ * Report whether the underlying stream (zlib or stdio) has hit end of file.
	+ */
	+static bool
	+elog_reader_eof(struct elog_reader *r)
	+{
	+	return (r->is_gz ? gzeof(r->gz) != 0 : feof(r->fp) != 0);
	+}
	+
	+ /* Close whichever handle (gzFile or FILE) the reader is using. */
	+ static void
	+ elog_reader_close(struct elog_reader *r)
	+ {
	+ 	if (!r->is_gz) {
	+ 		fclose(r->fp);
	+ 		return;
	+ 	}
	+ 	gzclose(r->gz);
	+ }
	+
	+/*
	+ * Read exactly 'len' bytes or fail. Returns true on success, false on EOF
	+ * (partial read at end of file).
	+ */
	+static bool
	+elog_reader_read_exact(struct elog_reader r, void buf, size_t len)
	+{
	+ size_t total = 0;
	+
	+ while (total < len) {
	+ ssize_t n;
	+
	+ n = elog_reader_read(r, (char *)buf + total, len - total);
	+ if (n <= 0) {
	+ if (total == 0)
	+ return (false);
	+ errx(1,
	+ "Unexpected end of file (read %zu of %zu bytes)",
	+ total, len);
	+ }
	+ total += n;
	+ }
	+ return (true);
	+}
	+
	+ /*
	+  * Decode a binary capture file (gzip-compressed when the name ends in
	+  * ".gz") and print its events.
	+  *
	+  * The file header's magic and version are validated, its capture_start and
	+  * start_utc_us are stashed in read_capture_start / read_start_utc_us, and
	+  * the provider list that follows is loaded into provider_map.  Event data
	+  * is then read in 64KB chunks and handed to parse_and_print_events();
	+  * bytes of an event that straddles a chunk boundary are carried over to
	+  * the next read.
	+  *
	+  * Returns 0 on success.  Malformed or truncated headers are fatal.
	+  */
	+ static int
	+ read_binary_file(const char *filename)
	+ {
	+ 	struct elog_reader reader;
	+ 	struct elog_binary_header file_hdr;
	+ 	unsigned char *buffer = NULL;
	+ 	unsigned char *partial_buffer = NULL;
	+ 	size_t buffer_size = 64 * 1024; /* 64KB chunks */
	+ 	size_t buffer_used = 0;
	+ 	size_t buffer_capacity = buffer_size;
	+ 	size_t partial_size = 0;
	+ 	ssize_t nread;
	+ 	size_t consumed;
	+
	+ 	elog_reader_open(&reader, filename);
	+
	+ 	/* Read and validate file header */
	+ 	memset(&file_hdr, 0, sizeof(file_hdr));
	+ 	if (!elog_reader_read_exact(&reader, &file_hdr, sizeof(file_hdr)))
	+ 		errx(1, "File is empty");
	+
	+ 	/* Validate magic number */
	+ 	if (memcmp(file_hdr.magic, ELOG_BINARY_MAGIC, 4) != 0)
	+ 		errx(1, "Invalid binary file: bad magic number");
	+
	+ 	/* Validate version */
	+ 	if (file_hdr.version != ELOG_BINARY_VERSION)
	+ 		errx(1, "Unsupported file version: %u (expected %u)",
	+ 		    file_hdr.version, ELOG_BINARY_VERSION);
	+
	+ 	/*
	+ 	 * Stash for format_timestamp when printing
	+ 	 * (UTC = start_utc_us + (event_ts - capture_start)).
	+ 	 */
	+ 	read_capture_start = file_hdr.capture_start;
	+ 	read_start_utc_us = file_hdr.start_utc_us;
	+
	+ 	/*
	+ 	 * V2: read provider list for event lookup.  A false return from
	+ 	 * elog_reader_read_exact() means end of file, not an errno-style
	+ 	 * failure, so report it with errx() rather than err().
	+ 	 */
	+ 	if (!elog_reader_read_exact(&reader, &provider_map_count,
	+ 	    sizeof(provider_map_count)))
	+ 		errx(1, "Unexpected end of file reading provider count");
	+ 	if (provider_map_count > EVENTLOG_MAX_PROVIDERS)
	+ 		errx(1, "Invalid provider count %u in file",
	+ 		    provider_map_count);
	+ 	for (uint32_t i = 0; i < provider_map_count; i++) {
	+ 		if (!elog_reader_read_exact(&reader, &provider_map[i],
	+ 		    sizeof(provider_map[i])))
	+ 			errx(1, "Unexpected end of file reading provider %u",
	+ 			    i);
	+ 	}
	+
	+ 	print_provider_names();
	+
	+ 	/* Allocate buffers for reading events */
	+ 	buffer = malloc(buffer_capacity);
	+ 	if (buffer == NULL)
	+ 		err(1, "malloc(buffer)");
	+ 	partial_buffer = malloc(buffer_capacity);
	+ 	if (partial_buffer == NULL)
	+ 		err(1, "malloc(partial_buffer)");
	+
	+ 	/*
	+ 	 * Read events in chunks and parse them using the same code as
	+ 	 * kernel events.
	+ 	 */
	+ 	while (!elog_reader_eof(&reader)) {
	+ 		/* If we have partial data from previous read, prepend it */
	+ 		if (partial_size > 0) {
	+ 			/*
	+ 			 * A completely full buffer that still holds no
	+ 			 * complete event leaves no room to read more;
	+ 			 * fail instead of looping forever.
	+ 			 */
	+ 			if (partial_size >= buffer_capacity) {
	+ 				errx(1,
	+ 				    "Partial event too large, file may be corrupted");
	+ 			}
	+ 			memcpy(buffer, partial_buffer, partial_size);
	+ 			buffer_used = partial_size;
	+ 			partial_size = 0;
	+ 		} else {
	+ 			buffer_used = 0;
	+ 		}
	+
	+ 		/* Read more data into buffer */
	+ 		nread = elog_reader_read(&reader, buffer + buffer_used,
	+ 		    buffer_capacity - buffer_used);
	+ 		if (nread <= 0) {
	+ 			/*
	+ 			 * No more data (EOF, or a read that made no
	+ 			 * progress): parse whatever is left and stop.
	+ 			 */
	+ 			if (buffer_used > 0)
	+ 				parse_and_print_events(buffer, buffer_used);
	+ 			break;
	+ 		}
	+
	+ 		buffer_used += (size_t)nread;
	+
	+ 		/* Parse events - returns number of bytes consumed */
	+ 		consumed = parse_and_print_events(buffer, buffer_used);
	+
	+ 		/*
	+ 		 * Move any remaining (partial) data to partial buffer.
	+ 		 * remaining <= buffer_used <= buffer_capacity, so it fits.
	+ 		 */
	+ 		if (consumed < buffer_used) {
	+ 			size_t remaining = buffer_used - consumed;
	+
	+ 			memcpy(partial_buffer, buffer + consumed, remaining);
	+ 			partial_size = remaining;
	+ 		}
	+ 	}
	+
	+ 	free(buffer);
	+ 	free(partial_buffer);
	+ 	elog_reader_close(&reader);
	+
	+ 	return (0);
	+ }
	+
	+/*
	+ * Parse and print eventlog data.
	+ * Format (V1):
	+ * Each event: eventlog_event_header (includes provider_id, session_id,
	+ * event_id) + payload.
	+ * event_length = sizeof(header) + payload_size
	+ * Multiple events can be present in a single buffer read.
	+ * Returns the number of bytes consumed (complete events processed).
	+ * Requires provider_map to be populated (from GET_PROVIDERS or file header).
	+ */
	+static size_t
	+parse_and_print_events(const unsigned char *data, size_t len)
	+{
	+ const unsigned char *buf = data;
	+ const unsigned char *end = data + len;
	+ const unsigned char *start = data;
	+
	+ while (buf < end) {
	+ struct eventlog_event_header hdr;
	+ size_t event_payload_len;
	+ const unsigned char *event_start = buf;
	+
	+ if (buf + sizeof(struct eventlog_event_header) > end)
	+ break;
	+ memcpy(&hdr, buf, sizeof(struct eventlog_event_header));
	+
	+ if (hdr.event_length < sizeof(struct eventlog_event_header)) {
	+ fprintf(stderr,
	+ "Error: invalid event_length %u at offset %zu\n",
	+ hdr.event_length, (size_t)(buf - start));
	+ buf += sizeof(struct eventlog_event_header);
	+ return (buf - start);
	+ }
	+ if (event_start + hdr.event_length > end)
	+ break;
	+
	+ buf += sizeof(struct eventlog_event_header);
	+ event_payload_len = hdr.event_length -
	+ sizeof(struct eventlog_event_header);
	+ print_eventlog_event(&hdr, buf, event_payload_len);
	+ events_received++;
	+ buf = event_start + hdr.event_length;
	+ }
	+ return (buf - start);
	+}
	+
	+ /*
	+  * Refresh last_dropped_events from the kernel (when the device is open)
	+  * and, if --stats was given, print a summary to stderr.  Only the first
	+  * call does anything; later calls return immediately.
	+  */
	+ static void
	+ print_stats(void)
	+ {
	+ 	struct eventlog_stats kstats;
	+
	+ 	/* Prevent double-printing */
	+ 	if (stats_printed)
	+ 		return;
	+ 	stats_printed = true;
	+
	+ 	/* Always query kernel stats for binary header update */
	+ 	if (eventlog_fd >= 0 &&
	+ 	    ioctl(eventlog_fd, EVENTLOG_IOCTL_GET_STATS, &kstats) == 0)
	+ 		last_dropped_events = kstats.dropped_events;
	+
	+ 	if (verbose_stats) {
	+ 		fprintf(stderr, "\n[Stats]\n");
	+ 		fprintf(stderr, " Providers: %u\n", provider_map_count);
	+ 		fprintf(stderr, " Events received: %llu\n",
	+ 		    (unsigned long long)events_received);
	+ 		if (last_dropped_events > 0)
	+ 			fprintf(stderr, " Dropped events: %llu\n",
	+ 			    (unsigned long long)last_dropped_events);
	+ 	}
	+ }
	+
	+ /*
	+  * Signal handler (installed by main() for SIGINT, SIGTERM and, with
	+  * --duration, SIGALRM): request an orderly shutdown.
	+  *
	+  * It only sets `done`.  stdio functions such as fflush() are not
	+  * async-signal-safe, and the handler can interrupt the main loop in the
	+  * middle of a stdio call on the same stream, so no flushing is done here.
	+  * The read loop in run_eventlog_mode() sees `done` after read() returns
	+  * EINTR and then flushes and closes the output files itself; stats are
	+  * printed by that path or by the atexit handler.
	+  */
	+ static void
	+ sigint_handler(int sig __unused)
	+ {
	+ 	done = true;
	+ }
	+
	+/*
	+ * Parse command line arguments and populate global state.
	+ */
	+static bool
	+arg_match(const char arg, const char long_form, const char *short_form)
	+{
	+ return (strcmp(arg, long_form) == 0 \|\|
	+ (short_form != NULL && strcmp(arg, short_form) == 0));
	+}
	+
	+ /*
	+  * Parse the command line and populate the global option state.
	+  *
	+  * Recognised options:
	+  *   --capture/-c <provider> [level] [keywords]   (repeatable)
	+  *   --buffer-size/-b <size>
	+  *   --duration <seconds>
	+  *   --output/-o <file | dir=path>
	+  *   --read-binary/-r <file>
	+  *   --date/-d, --event-name/-e, --event-number/-n, --providers/-p,
	+  *   --stats/-s, --relative-time/-t, --delta-time, --dump-state/-D,
	+  *   --help/-h
	+  *
	+  * A missing option value, a malformed value or an unknown argument is
	+  * fatal (errx/err).
	+  */
	+ static void
	+ parse_arguments(int argc, char *argv[])
	+ {
	+ 	int arg_idx;
	+ 	const char *arg;
	+
	+ 	for (arg_idx = 1; arg_idx < argc; arg_idx++) {
	+ 		arg = argv[arg_idx];
	+ 		if (arg_match(arg, "--capture", "-c")) {
	+ 			enum eventlog_level level = EVENTLOG_LEVEL_VERBOSE;
	+ 			uint32_t keywords = 0xFFFFFFFF;
	+ 			int next_idx = arg_idx + 1;
	+
	+ 			/* Grow the subscription array geometrically. */
	+ 			if (subscription_count >= subscription_capacity) {
	+ 				int new_capacity = (subscription_capacity == 0)
	+ 				    ? 16 : subscription_capacity * 2;
	+ 				struct subscription *new_subscriptions =
	+ 				    realloc(subscriptions, new_capacity *
	+ 				    sizeof(struct subscription));
	+ 				if (new_subscriptions == NULL)
	+ 					errx(1,
	+ 					    "failed to allocate subscriptions");
	+ 				subscriptions = new_subscriptions;
	+ 				subscription_capacity = new_capacity;
	+ 			}
	+
	+ 			if (next_idx >= argc)
	+ 				errx(1,
	+ 				    "--capture requires at least provider name");
	+
	+ 			if (strlen(argv[next_idx]) >=
	+ 			    EVENTLOG_PROVIDER_NAME_MAX)
	+ 				errx(1, "provider name too long");
	+
	+ 			memset(&subscriptions[subscription_count], 0,
	+ 			    sizeof(subscriptions[0]));
	+ 			strlcpy(subscriptions[subscription_count].provider_name,
	+ 			    argv[next_idx],
	+ 			    sizeof(subscriptions[0].provider_name));
	+ 			next_idx++;
	+
	+ 			/* Optional level: consumed only if it parses as one. */
	+ 			if (next_idx < argc &&
	+ 			    try_parse_level(argv[next_idx], &level))
	+ 				next_idx++;
	+
	+ 			/*
	+ 			 * Optional keywords (hex 0x prefix or pipe-delimited
	+ 			 * names).
	+ 			 */
	+ 			if (next_idx < argc &&
	+ 			    try_parse_keywords(
	+ 			    subscriptions[subscription_count].provider_name,
	+ 			    argv[next_idx], &keywords))
	+ 				next_idx++;
	+
	+ 			/* Always include SESSION for lifecycle events. */
	+ 			subscriptions[subscription_count].level = level;
	+ 			subscriptions[subscription_count].keywords =
	+ 			    keywords | EVENTLOG_KEYWORD_SESSION;
	+
	+ 			/* The loop's arg_idx++ then lands on next_idx. */
	+ 			arg_idx = next_idx - 1;
	+ 			subscription_count++;
	+ 		} else if (arg_match(arg, "--buffer-size", "-b")) {
	+ 			int next_idx = arg_idx + 1;
	+ 			if (next_idx >= argc)
	+ 				errx(1, "--buffer-size requires a size value");
	+ 			buffer_size_per_cpu = parse_size(argv[next_idx]);
	+ 			arg_idx = next_idx;
	+ 		} else if (arg_match(arg, "--duration", NULL)) {
	+ 			int next_idx = arg_idx + 1;
	+ 			char *endptr;
	+ 			unsigned long val;
	+
	+ 			if (next_idx >= argc)
	+ 				errx(1, "--duration requires a seconds value");
	+ 			errno = 0;
	+ 			val = strtoul(argv[next_idx], &endptr, 10);
	+ 			/*
	+ 			 * Reject an empty string and trailing garbage by
	+ 			 * looking at the characters, not the pointers.
	+ 			 */
	+ 			if (*argv[next_idx] == '\0' || *endptr != '\0')
	+ 				errx(1, "--duration: not a number: %s",
	+ 				    argv[next_idx]);
	+ 			if (errno == ERANGE || val > UINT_MAX)
	+ 				errx(1, "--duration: value too large");
	+ 			duration_sec = (unsigned int)val;
	+ 			arg_idx = next_idx;
	+ 		} else if (arg_match(arg, "--date", "-d")) {
	+ 			show_date = true;
	+ 		} else if (arg_match(arg, "--event-name", "-e")) {
	+ 			show_event_name = true;
	+ 		} else if (arg_match(arg, "--event-number", "-n")) {
	+ 			show_event_number = true;
	+ 		} else if (arg_match(arg, "--providers", "-p")) {
	+ 			show_providers = true;
	+ 		} else if (arg_match(arg, "--stats", "-s")) {
	+ 			verbose_stats = true;
	+ 		} else if (arg_match(arg, "--output", "-o")) {
	+ 			int next_idx = arg_idx + 1;
	+ 			if (next_idx >= argc)
	+ 				errx(1,
	+ 				    "--output requires a filename or dir=path");
	+ 			if (strncmp(argv[next_idx], "dir=", 4) == 0) {
	+ 				/* Directory mode: one file per session. */
	+ 				output_dir = strdup(argv[next_idx] + 4);
	+ 				if (output_dir == NULL)
	+ 					err(1, "strdup");
	+ 				if (mkdir(output_dir, 0755) != 0 &&
	+ 				    errno != EEXIST)
	+ 					err(1, "mkdir(%s)", output_dir);
	+ 			} else {
	+ 				single_output.fp = fopen(argv[next_idx], "wb");
	+ 				if (single_output.fp == NULL)
	+ 					err(1, "fopen(%s)", argv[next_idx]);
	+ 			}
	+ 			arg_idx = next_idx;
	+ 		} else if (arg_match(arg, "--relative-time", "-t")) {
	+ 			show_relative_time = true;
	+ 		} else if (arg_match(arg, "--delta-time", NULL)) {
	+ 			show_delta_time = true;
	+ 		} else if (arg_match(arg, "--dump-state", "-D")) {
	+ 			dump_state = true;
	+ 		} else if (arg_match(arg, "--read-binary", "-r")) {
	+ 			int next_idx = arg_idx + 1;
	+ 			if (next_idx >= argc)
	+ 				errx(1, "--read-binary requires a filename");
	+ 			binary_input_file = argv[next_idx];
	+ 			arg_idx = next_idx;
	+ 		} else if (arg_match(arg, "--help", "-h")) {
	+ 			usage();
	+ 		} else {
	+ 			errx(1, "unknown argument: %s (use --capture or -c)",
	+ 			    arg);
	+ 		}
	+ 	}
	+ }
	+
	+/*
	+ * Run eventlog device mode - open device, create subscriber, and read events.
	+ */
	+static int
	+run_eventlog_mode(void)
	+{
	+ int fd;
	+ char device_path[] = "/dev/eventlog";
	+ char *buffer;
	+ ssize_t nread;
	+ size_t bufsize = 1024 * 1024;
	+ int i, error;
	+
	+ /* Open device */
	+ fd = open(device_path, O_RDONLY);
	+ if (fd < 0) {
	+ err(1, "open(%s)", device_path);
	+ }
	+
	+ /* Prepare CREATE request with buffer size and subscriptions */
	+ /* Calculate exact size needed */
	+ size_t base_offset = __builtin_offsetof(struct eventlog_create_req,
	+ subscriptions);
	+ size_t sub_size = sizeof(struct eventlog_subscription_req);
	+ size_t req_size = base_offset + subscription_count * sub_size;
	+ struct eventlog_create_req *req;
	+ u_long ioctl_cmd;
	+ size_t ioctl_size;
	+
	+ req = malloc(req_size);
	+ if (req == NULL)
	+ err(1, "malloc");
	+
	+ memset(req, 0, req_size);
	+ req->buffer_size_per_cpu = buffer_size_per_cpu;
	+ req->count = subscription_count;
	+ for (i = 0; i < subscription_count; i++) {
	+ strlcpy(req->subscriptions[i].provider_name,
	+ subscriptions[i].provider_name,
	+ sizeof(req->subscriptions[i].provider_name));
	+ req->subscriptions[i].level = subscriptions[i].level;
	+ req->subscriptions[i].keywords = subscriptions[i].keywords;
	+ req->subscriptions[i].flags = dump_state ?
	+ EVENTLOG_SUBSCRIPTION_DUMP_STATE : 0;
	+ }
	+
	+ /* Calculate ioctl command with exact size */
	+ ioctl_cmd = EVENTLOG_IOCTL_CREATE_SIZE(subscription_count);
	+ ioctl_size = ((ioctl_cmd >> 16) & 0x1fff); /* Extract IOCPARM_LEN */
	+
	+ /* Verify sizes match */
	+ if (ioctl_size != req_size) {
	+ errx(1, "ioctl size calculation error");
	+ }
	+
	+ /* Send CREATE ioctl (creates subscriber and subscribes) */
	+ error = ioctl(fd, ioctl_cmd, req);
	+ if (error != 0) {
	+ err(1, "ioctl(EVENTLOG_IOCTL_CREATE)");
	+ }
	+
	+ free(req);
	+
	+ /* Get provider ids for event lookup (required for new binary format) */
	+ {
	+ struct eventlog_get_providers_resp prov_resp;
	+ memset(&prov_resp, 0, sizeof(prov_resp));
	+ error = ioctl(fd, EVENTLOG_IOCTL_GET_PROVIDERS, &prov_resp);
	+ if (error != 0)
	+ err(1, "ioctl(EVENTLOG_IOCTL_GET_PROVIDERS)");
	+ provider_map_count = prov_resp.count;
	+ memcpy(provider_map, prov_resp.providers,
	+ provider_map_count * sizeof(provider_map[0]));
	+ }
	+
	+ print_provider_names();
	+
	+
	+ /* Allocate buffer */
	+ buffer = malloc(bufsize);
	+ if (buffer == NULL)
	+ err(1, "malloc");
	+
	+ eventlog_fd = fd; /* Store for signal handler */
	+
	+ /* Read and parse events */
	+ while (!done) {
	+ nread = read(fd, buffer, bufsize);
	+ if (nread < 0) {
	+ if (errno == EINTR) {
	+ /* Check if we were interrupted by signal */
	+ if (done)
	+ break;
	+ continue;
	+ }
	+ if (errno == EAGAIN)
	+ continue;
	+ err(1, "read");
	+ }
	+ if (nread == 0) {
	+ /* EOF - wait a bit and retry */
	+ usleep(100000); /* 100ms */
	+ continue;
	+ }
	+
	+ /* Provider name is included in the event format. */
	+ parse_and_print_events((const unsigned char *)buffer, nread);
	+ }
	+
	+ /* Print stats before cleanup */
	+ print_stats();
	+
	+ /* Update binary header with final event/drop counts before closing */
	+ update_binary_header();
	+
	+ /* Cleanup on exit */
	+ close(fd);
	+ eventlog_fd = -1;
	+ free(buffer);
	+ free(subscriptions);
	+ if (output_dir != NULL) {
	+ struct session_file sf, sf_next;
	+ for (sf = STAILQ_FIRST(&session_files); sf != NULL;
	+ sf = sf_next) {
	+ sf_next = STAILQ_NEXT(sf, link);
	+ if (sf->fp != NULL) {
	+ fflush(sf->fp);
	+ fclose(sf->fp);
	+ sf->fp = NULL;
	+ }
	+ free(sf->filepath);
	+ free(sf->session_id);
	+ free(sf);
	+ }
	+ STAILQ_INIT(&session_files);
	+ free(output_dir);
	+ output_dir = NULL;
	+ } else if (single_output.fp != NULL) {
	+ fflush(single_output.fp);
	+ fclose(single_output.fp);
	+ single_output.fp = NULL;
	+ }
	+
	+ return (0);
	+}
	+
	+ /*
	+  * Entry point.  After option parsing the program runs in one of two
	+  * modes: decoding a saved binary file (--read-binary) or live capture
	+  * from the eventlog device, which needs at least one --capture
	+  * subscription.
	+  */
	+ int
	+ main(int argc, char *argv[])
	+ {
	+ 	struct sigaction sa;
	+
	+ 	parse_arguments(argc, argv);
	+
	+ 	/* Binary read mode bypasses capture entirely. */
	+ 	if (binary_input_file != NULL) {
	+ 		if (subscription_count > 0)
	+ 			errx(1,
	+ 			    "--read-binary cannot be used with subscribe options");
	+ 		return (read_binary_file(binary_input_file));
	+ 	}
	+
	+ 	/* Live capture makes no sense without a subscription. */
	+ 	if (subscription_count == 0)
	+ 		errx(1, "no subscriptions specified (use --capture or -c)");
	+
	+ 	/* Make sure stats are printed however the process exits. */
	+ 	atexit(print_stats);
	+
	+ 	/* One handler serves every shutdown signal. */
	+ 	sigemptyset(&sa.sa_mask);
	+ 	sa.sa_flags = 0;
	+ 	sa.sa_handler = sigint_handler;
	+ 	(void)sigaction(SIGINT, &sa, NULL);
	+ 	(void)sigaction(SIGTERM, &sa, NULL);
	+
	+ 	/*
	+ 	 * --duration arms a SIGALRM that takes the same shutdown path as
	+ 	 * SIGINT/SIGTERM (see sigint_handler); the read loop in
	+ 	 * run_eventlog_mode() checks `done` after EINTR.
	+ 	 */
	+ 	if (duration_sec > 0) {
	+ 		(void)sigaction(SIGALRM, &sa, NULL);
	+ 		alarm(duration_sec);
	+ 	}
	+
	+ 	return (run_eventlog_mode());
	+ }
	+
	diff --git a/usr.bin/elog/gen_eventlog_headers.sh b/usr.bin/elog/gen_eventlog_headers.sh
	new file mode 100755
	--- /dev/null
	+++ b/usr.bin/elog/gen_eventlog_headers.sh
	@@ -0,0 +1,193 @@
	+#!/bin/sh
	+#
	+# Copyright (c) 2026 Netflix, Inc.
	+#
	+# SPDX-License-Identifier: BSD-2-Clause
	+#
	+# Generate eventlog consumer headers from schema files. This script
	+# generates both individual provider headers and the master header.
	+#
	+# Usage:
	+# gen_eventlog_headers.sh <schema_dir> <header_dir> <srctop> \
	+# <awk_script> <master_header>
	+#
	+
	+ set -e
	+
	+ # Validate the argument count before touching the positional parameters.
	+ if [ $# -ne 5 ]; then
	+ 	echo "Usage: $0 <schema_dir> <header_dir> <srctop> <awk_script>" \
	+ 	    "<master_header>" >&2
	+ 	exit 1
	+ fi
	+
	+ SCHEMA_DIR="$1"
	+ HEADER_DIR="$2"
	+ SRCTOP="$3"
	+ AWK_SCRIPT="$4"
	+ MASTER_HEADER="$5"
	+
	+ # Resolve HEADER_DIR to an absolute path BEFORE any cd operations to
	+ # avoid creating directories in the source tree.
	+ case "${HEADER_DIR}" in
	+ /*)
	+ 	ABS_HEADER_DIR="${HEADER_DIR}"
	+ 	;;
	+ *)
	+ 	_ORIG_PWD="${PWD}"
	+ 	# "realpath -m" (resolve a path that need not exist) is a GNU
	+ 	# coreutils option that not every realpath(1) accepts.  Try it,
	+ 	# and fall back to plain concatenation when realpath is missing
	+ 	# or rejects the option; the "||" keeps "set -e" from aborting.
	+ 	ABS_HEADER_DIR="$(realpath -m \
	+ 	    "${_ORIG_PWD}/${HEADER_DIR}" 2>/dev/null)" || ABS_HEADER_DIR=""
	+ 	if [ -z "${ABS_HEADER_DIR}" ]; then
	+ 		case "${HEADER_DIR}" in
	+ 		.)
	+ 			ABS_HEADER_DIR="${_ORIG_PWD}"
	+ 			;;
	+ 		..)
	+ 			ABS_HEADER_DIR="$(cd "${_ORIG_PWD}/.." && pwd)"
	+ 			;;
	+ 		*)
	+ 			ABS_HEADER_DIR="${_ORIG_PWD}/${HEADER_DIR}"
	+ 			;;
	+ 		esac
	+ 	fi
	+ 	;;
	+ esac
	+
	+ # Refuse to write inside the source tree.  SRCTOP is quoted so that any
	+ # glob characters in it are matched literally.
	+ case "${ABS_HEADER_DIR}" in
	+ "${SRCTOP}"/*)
	+ 	echo "ERROR: Header directory ${ABS_HEADER_DIR} would be" \
	+ 	    "created inside source tree ${SRCTOP}" >&2
	+ 	exit 1
	+ 	;;
	+ esac
	+
	+ mkdir -p "${ABS_HEADER_DIR}"
	+
	+ # Print one lower-cased provider name per *_eventlog_schema.src found
	+ # under SCHEMA_DIR. Used by the per-provider loops below.
	+ # The name is the second field of the first line that starts with
	+ # PROVIDER; schema files without such a line print nothing.  Prints
	+ # nothing and succeeds when SCHEMA_DIR is not a directory.
	+ list_providers() {
	+ 	[ -d "${SCHEMA_DIR}" ] || return 0
	+ 	for schema_path in $(find "${SCHEMA_DIR}" \
	+ 	    -name '*_eventlog_schema.src' 2>/dev/null | sort); do
	+ 		awk '/^PROVIDER/ {print tolower($2); exit}' \
	+ 		    "${schema_path}" 2>/dev/null || true
	+ 	done
	+ }
	+
	+ # Step 1: Generate individual consumer headers for each schema.
	+ # Schemas without a PROVIDER line are skipped.  The generator runs from
	+ # SRCTOP with outdir set to the absolute header directory.
	+ if [ -d "${SCHEMA_DIR}" ]; then
	+ 	for schema_path in $(find "${SCHEMA_DIR}" \
	+ 	    -name '*_eventlog_schema.src' 2>/dev/null | sort); do
	+ 		provider=$(awk '/^PROVIDER/ {print tolower($2); exit}' \
	+ 		    "${schema_path}" 2>/dev/null || true)
	+ 		if [ -n "${provider}" ]; then
	+ 			(cd "${SRCTOP}" && \
	+ 			    awk -v outdir="${ABS_HEADER_DIR}" \
	+ 			    -f "${AWK_SCRIPT}" "${schema_path}" -c)
	+ 		fi
	+ 	done
	+ fi
	+
	+ # Step 2: Generate master consumer header that includes all provider
	+ # headers.
	+ # A relative MASTER_HEADER is placed inside the header directory.  The
	+ # file is built in pieces: fixed C text comes from quoted here-documents
	+ # (no shell expansion), and one dispatch branch per provider is appended
	+ # by the loops over list_providers.
	+ case "${MASTER_HEADER}" in
	+ /*)
	+ 	ABS_MASTER_HEADER="${MASTER_HEADER}"
	+ 	;;
	+ *)
	+ 	ABS_MASTER_HEADER="${ABS_HEADER_DIR}/${MASTER_HEADER}"
	+ 	;;
	+ esac
	+
	+ cat > "${ABS_MASTER_HEADER}" << 'EOF'
	+ /* Auto-generated consumer header - includes all provider consumer headers */
	+ #ifndef _EVENTLOG_CONSUMER_H_
	+ #define _EVENTLOG_CONSUMER_H_
	+
	+ #include <sys/eventlog.h>
	+ EOF
	+
	+ for provider in $(list_providers); do
	+ 	echo "#include \"${provider}_eventlog_consumer.h\"" \
	+ 	    >> "${ABS_MASTER_HEADER}"
	+ done
	+
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+
	+ /*
	+ * Check if event is SESSION_END (fixed ID, all providers). Include
	+ * sys/eventlog.h for EVENTLOG_SESSION_END_ID.
	+ */
	+ static inline bool
	+ eventlog_is_session_end(const char *provider_name, uint32_t event_id)
	+ {
	+ (void)provider_name;
	+ return event_id == EVENTLOG_SESSION_END_ID;
	+ }
	+ EOF
	+
	+ # The emitted prototypes take pointers: provider_name is passed to
	+ # strcmp() and payload is handed to the per-provider formatter.
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+
	+ /* Master formatting function that routes to per-provider formatters */
	+ static inline int
	+ eventlog_format_payload(const char *provider_name, const void *payload,
	+ size_t payload_size, uint32_t event_id, char *buf, size_t bufsize)
	+ {
	+ EOF
	+
	+ for provider in $(list_providers); do
	+ 	{
	+ 		echo " if (strcmp(provider_name, \"${provider}\") == 0)"
	+ 		echo " return ${provider}_eventlog_format_payload("
	+ 		echo " payload, payload_size, event_id,"
	+ 		echo " buf, bufsize);"
	+ 	} >> "${ABS_MASTER_HEADER}"
	+ done
	+
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+ return snprintf(buf, bufsize, "[UNKNOWN_PROVIDER:%s]",
	+ provider_name);
	+ }
	+ EOF
	+
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+
	+ /* Master event ID to name lookup (routes to per-provider lookups) */
	+ static inline const char *
	+ eventlog_event_id_to_name(const char *provider_name, uint32_t event_id)
	+ {
	+ if (event_id == EVENTLOG_SESSION_END_ID)
	+ return "SESSION_END";
	+ EOF
	+
	+ for provider in $(list_providers); do
	+ 	{
	+ 		echo " if (strcmp(provider_name, \"${provider}\") == 0)"
	+ 		echo " return ${provider}_eventlog_event_id_to_name(" \
	+ 		    "event_id);"
	+ 	} >> "${ABS_MASTER_HEADER}"
	+ done
	+
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+ return NULL;
	+ }
	+ EOF
	+
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+
	+ /* Master keyword name to bitmask lookup (routes to per-provider lookups) */
	+ static inline uint32_t
	+ eventlog_keyword_from_string(const char *provider_name, const char *name)
	+ {
	+ EOF
	+
	+ for provider in $(list_providers); do
	+ 	{
	+ 		fn="${provider}_eventlog_keyword_from_string"
	+ 		echo " if (strcmp(provider_name, \"${provider}\") == 0)"
	+ 		echo " return ${fn}(name);"
	+ 	} >> "${ABS_MASTER_HEADER}"
	+ done
	+
	+ cat >> "${ABS_MASTER_HEADER}" << 'EOF'
	+ return (0);
	+ }
	+
	+ #endif /* _EVENTLOG_CONSUMER_H_ */
	+ EOF

File Metadata

Mime Type: text/plain
Expires: Wed, May 20, 11:28 AM (8 h, 3 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 33267378
Default Alt Text: D56979.diff (423 KB)

D56979.diffNo OneTemporaryActions

D56979.diffView Options

File Metadata

Event Timeline

D56979.diff
No OneTemporary
Actions

D56979.diff
View Options