feat(audit): add phase0 run/reaction baseline audit events

2026-02-25 00:12:31 -08:00
parent c89889d9c1
commit 23b813a92f
6 changed files with 684 additions and 3 deletions
@@ -0,0 +1,259 @@
 # Flynn Deeper End-User Surfaces + Integrated Behavior Stack Plan
 Date: 2026-02-25  
 Status: proposed roadmap  
 Scope: deepen assistant "product feel" (behavior semantics + user-facing surfaces) without rewriting Flynn core architecture
 ## Summary
 This plan adopts a balanced hybrid strategy:
 1. Improve behavior semantics first where correctness risk is highest (interrupt/cancel/run control).
 2. In parallel, ship selective deeper user surfaces (companion, canvas persistence, voice continuity).
 3. Land each slice with explicit observability gates so rollout decisions are data-driven.
 ## Why This Plan
 Flynn already has strong foundations:
 - Queue + session orchestration: `src/gateway/lane-queue.ts`, `src/gateway/session-bridge.ts`
 - Multi-path routing and backend fallback: `src/daemon/routing.ts`
 - Companion RPC foundation: `src/companion/runtimeClient.ts`, `src/gateway/protocol.ts`
 - Canvas API baseline: `src/gateway/handlers/canvas.ts`, `src/gateway/canvas-store.ts`
 - Voice in/out primitives: `src/models/media.ts`, `src/models/tts.ts`
 - Reactions baseline: `src/automation/reactions.ts`
 The largest remaining gap versus an OpenClaw-like "assistant feel" is integration behavior across those systems, not missing foundational architecture.
 ## Goals and Success Criteria
 1. Deterministic active-run control under bursty traffic
 2. Rich, safe proactive behavior stack
 3. Durable end-user surfaces for companion/canvas/voice
 4. Measurable reliability improvements across canary phases
 Quantitative success gates:
 1. Cancel-to-ack p95 <= 500ms on gateway sessions.
 2. Duplicate assistant responses caused by run preemption: 0 in integration tests.
 3. Reaction false-positive rate <= 3% in canary logs.
 4. Companion reconnect success >= 99% in soak tests.
 5. Canvas artifact persistence survives daemon restart in integration tests.
 6. Voice failures degrade to text-only replies with no dropped responses.
 ## Out of Scope
 1. Full native macOS/iOS/Android app suite in this phase set.
 2. Broad protocol redesign or protocol-version breaking changes.
 3. Pi backend expansion (kept separate from this roadmap until re-approval).
 ## Workstreams and Complexity
 | Workstream | Complexity | Main Risk |
 | --- | --- | --- |
 | Run-control semantics unification | High | race conditions and cancellation ordering |
 | Reactions + proactive behavior v2 | Medium-High | noisy or looping automation |
 | Companion + canvas + voice deepening | High | cross-surface consistency and restart behavior |
 | Rollout hardening + observability | Medium | incomplete canary signals |
 ## Phase 0 - Baseline Instrumentation and Guardrails
 Duration: 3-5 days
 Execution checklist:
 - `docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md`
 ### Deliverables
 1. Add baseline event and metric instrumentation for:
   - run-state transitions,
   - cancellation path timings,
   - reaction match/skip reasons,
   - surface delivery outcomes.
 2. Define canary gate calculator inputs for each later phase.
 ### Files
 1. `src/audit/types.ts`
 2. `src/audit/logger.ts`
 3. `src/gateway/metrics.ts`
 4. `docs/api/PROTOCOL.md` (event semantics if needed)
 5. `docs/plans/state.json`
 ### Acceptance
 1. Baseline report generated before behavior changes.
 2. No runtime behavior changes in this phase, only observability.
 ## Phase 1 - Run-Control Semantics + Gateway UX Signals
 Duration: 2-3 weeks
 ### Objectives
 1. Enforce "latest wins" semantics when queue policy is `interrupt`.
 2. Align cancellation behavior between gateway and channel router paths.
 3. Expose user-visible run lifecycle state in gateway events/UI.
 ### Implementation
 1. Queue semantics hardening:
   - `src/gateway/lane-queue.ts`
   - ensure queued + active behavior rules are explicit and testable.
 2. Active run cancellation wiring:
   - `src/gateway/handlers/agent.ts`
   - `src/gateway/session-bridge.ts`
   - `src/daemon/routing.ts` (`activeRuns` parity behavior).
 3. Event surface:
   - add additive `run_state` event in `src/gateway/protocol.ts`
   - consume/render in `src/gateway/ui/pages/chat.js`.
 ### Test Plan
 1. `src/gateway/lane-queue.test.ts`: preemption ordering, overflow with interrupt, debounce edge cases.
 2. `src/gateway/handlers/agent.test.ts`: interrupt + active cancel + queued supersede flows.
 3. `src/daemon/routing.test.ts`: channel-path cancellation parity.
 4. `src/gateway/ui/pages/chat.test.ts`: `run_state` rendering and transitions.
 ### Acceptance
 1. Cancel-to-ack p95 <= 500ms.
 2. Zero duplicate final responses in integration suite.
 3. Backward compatibility for clients ignoring `run_state`.
 ## Phase 2 - Reactions and Proactive Behavior Stack V2
 Duration: 2 weeks
 ### Objectives
 1. Replace first-match reaction behavior with deterministic priority + cooldown semantics.
 2. Keep announce delivery safe and auditable.
 3. Prevent recursion/looping behavior.
 ### Config/API Additions (backward-compatible)
 Extend `automation.reactions[]` in `src/config/schema.ts` with:
 1. `priority` (number, default `100`)
 2. `cooldown_ms` (number, default `0`)
 3. `stop_on_match` (boolean, default `true`)
 Existing fields remain valid and unchanged.
 ### Implementation
 1. Reaction engine expansion:
   - `src/automation/reactions.ts` (or split `reactionEngine.ts` if needed).
 2. Routing integration:
   - `src/daemon/routing.ts` deterministic reaction resolution.
 3. Delivery consistency:
   - `src/automation/cron.ts`
   - `src/automation/webhooks.ts`
   - preserve `delivery_mode` semantics and audit metadata.
 ### Test Plan
 1. `src/automation/reactions.test.ts`:
   - priority conflict resolution,
   - cooldown suppression,
   - metadata and template rendering.
 2. `src/daemon/routing.test.ts`:
   - reaction trigger integration and command-path exclusion.
 3. `src/automation/cron.test.ts` / `src/automation/webhooks.test.ts`:
   - announce/isolation metadata correctness.
 ### Acceptance
 1. False-positive match rate <= 3% in canary.
 2. No reaction recursion loops.
 3. Deterministic rule selection under overlap.
 ## Phase 3 - Deeper Surfaces: Companion, Canvas Durability, Voice Continuity
 Duration: 3-4 weeks
 ### Objectives
 1. Upgrade companion from heartbeat-only utility to reliable daily-use surface.
 2. Make canvas artifacts durable across restart.
 3. Improve voice continuity behavior around cancellation and fallbacks.
 ### Implementation
 1. Companion hardening:
   - `src/cli/companion.ts`
   - `src/companion/runtimeClient.ts`
   - `src/gateway/handlers/node.ts`
   - focus on reconnect and subscription resilience.
 2. Canvas persistence:
   - `src/gateway/canvas-store.ts` (durable backing instead of in-memory only)
   - `src/gateway/handlers/canvas.ts`
   - UI rendering/inspection in `src/gateway/ui/pages/chat.js` (or dedicated canvas page).
 3. Voice continuity:
   - `src/daemon/routing.ts` (talk-mode + cancellation + output behavior)
   - `src/models/tts.ts`
   - channel adapter output checks where required.
 ### Test Plan
 1. Companion integration tests for reconnect and event continuity.
 2. Canvas store tests for restart durability and eviction policy.
 3. Voice tests for TTS errors, fallback to text, and interrupted runs.
 ### Acceptance
 1. Companion reconnect success >= 99% in soak.
 2. Canvas survives daemon restart in integration suite.
 3. Voice path never drops assistant reply when TTS fails.
 ## Phase 4 - Rollout, Hardening, and Operator Readiness
 Duration: 1 week
 ### Deliverables
 1. Canary rollout plan by feature flag/surface.
 2. Explicit rollback playbook.
 3. Operator docs and architecture/protocol docs synchronized.
 ### Documentation Updates (required in same PRs)
 1. `README.md`
 2. `docs/api/PROTOCOL.md`
 3. `docs/architecture/AGENT_DIAGRAM.md`
 4. `docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md`
 5. `docs/plans/state.json`
 ## Execution and Commit Structure
 1. Branch: `feature/deeper-surfaces-integrated-behavior-stack`
 2. Atomic commits per slice:
   - implementation + tests + docs + `state.json` together
 3. Rebase onto `main` before merge.
 4. Fast-forward merge only.
 ## Model-Tier Delegation Plan for Implementation Work
 1. `claude-haiku-4.5`:
   - mechanical schema/test/doc updates.
 2. `claude-sonnet-4.6`:
   - default implementation tasks across queue/routing/companion/canvas.
 3. `claude-opus-4.6`:
   - concurrency semantics review, failure-mode design, and rollout gate design.
 ## Risks and Mitigations
 1. Risk: preemption races create duplicate or orphaned replies.  
   Mitigation: run-state event model + deterministic cancellation tests.
 2. Risk: proactive rules become noisy.  
   Mitigation: priority/cooldown/stop semantics + canary thresholds.
 3. Risk: deeper surfaces drift from core behavior semantics.  
   Mitigation: shared gateway protocol contracts and integration tests across surfaces.
 ## Default Decisions Locked
 1. Keep gateway protocol backward-compatible (additive only).
 2. Prioritize behavior reliability before broadening platform count.
 3. Use companion CLI/runtime path as first deep-surface target.
 4. Keep Pi expansion out of this roadmap until separate canary re-approval.
@@ -0,0 +1,205 @@
 # Phase 0 Ticket Checklist: Baseline Instrumentation and Guardrails
 Created: 2026-02-25  
 Owner: Flynn core  
 Status: ready to implement  
 Parent: `docs/plans/2026-02-25-deeper-end-user-surfaces-and-integrated-behavior-stack-plan.md`
 ## Goal
 Establish a measurement baseline for deeper-surface and behavior-stack work before changing runtime semantics.
 This checklist intentionally avoids behavior changes. It adds instrumentation and reporting only.
 ## PR Boundary
 In scope:
 1. Audit event coverage for run lifecycle, cancel intent/result, and reaction decision paths.
 2. Gateway metrics support for cancel latency and run transition counters.
 3. Baseline summary tooling for repeatable canary comparisons.
 4. Documentation updates for new telemetry fields and collection workflow.
 Out of scope:
 1. Queue semantic changes (`interrupt`, `steer`, active preemption behavior).
 2. New gateway event surfaces to clients (`run_state` comes in Phase 1).
 3. Reactions behavior changes (priority/cooldown/stop semantics come in Phase 2).
 4. Companion/canvas persistence behavior changes (Phase 3).
 ## Ticket 0.1 — Audit Schema Extension
 Status: completed (2026-02-25)
 ### Scope
 Add additive audit event types and payload contracts:
 1. `run.state` (start, complete, cancel_requested, cancelled, error)
 2. `run.cancel` (source, requested, acknowledged, latency_ms)
 3. `reaction.match` (rule_name, channel, sender, filter_summary)
 4. `reaction.skip` (reason category, candidate_count)
 ### Files
 1. `src/audit/types.ts`
 2. `src/audit/logger.ts`
 3. `src/audit/logger.test.ts`
 ### Acceptance
 1. All new events are additive and backward-compatible.
 2. Audit logger exposes typed methods for each new event.
 3. Unit tests cover serialization of the new events, and the existing path-expansion test still passes.
 ### Suggested commit message
 `feat(audit): add run lifecycle and reaction decision event types`
 ## Ticket 0.2 — Gateway/Router Emitters for Baseline Events
 ### Scope
 Emit new audit events without changing request handling behavior:
 1. Gateway `agent.send` path:
   - emit run start/complete/error states,
   - emit cancel request/ack timing when `/stop` or cancellation path is used.
 2. Daemon routing path:
   - emit matching run lifecycle events for channel-origin sessions,
   - emit reaction match/skip events around `matchReactionPrompt(...)`.
 ### Files
 1. `src/gateway/handlers/agent.ts`
 2. `src/daemon/routing.ts`
 3. `src/gateway/handlers/agent.test.ts`
 4. `src/daemon/routing.test.ts`
 ### Acceptance
 1. Event emission is side-effect free and does not alter reply behavior.
 2. Run IDs/session IDs are consistent with existing audit conventions.
 3. Existing tests continue passing; new assertions verify emitted metadata.
 ### Suggested commit message
 `feat(observability): emit run and reaction baseline audit events`
 ## Ticket 0.3 — Metrics Collector Baseline Counters
 ### Scope
 Extend in-memory gateway metrics with baseline counters:
 1. Run-state counters by outcome (`completed`, `cancelled`, `errored`).
 2. Cancel-latency histogram buckets (or ring-buffer sample list).
 3. Reaction decision counters (`matched`, `skipped`, per-reason).
 ### Files
 1. `src/gateway/metrics.ts`
 2. `src/gateway/metrics.test.ts`
 3. `src/gateway/handlers/observability.ts` (if exposing additional metrics fields)
 4. `src/gateway/handlers/observability.test.ts` (if needed)
 ### Acceptance
 1. New metrics appear in snapshots with stable defaults.
 2. No regression to existing dashboard/observability consumers.
 3. Tests validate accumulation, reset behavior (if any), and shape compatibility.
 ### Suggested commit message
 `feat(metrics): add baseline run and reaction counters for phase-0`
 ## Ticket 0.4 — Baseline Summary Tooling
 ### Scope
 Add or extend report tooling to summarize phase-0 telemetry slices:
 1. Run completion/error/cancel rates by channel/session.
 2. Cancel latency p50/p95.
 3. Reaction match/skip rates and top skip reasons.
 4. Optional markdown + json output for plan artifacts.
 ### Files
 1. `src/audit/backendCanarySummary.ts` (extend or split shared summarizer helpers)
 2. `src/audit/backendCanarySummary.test.ts`
 3. `scripts/summarize-backend-canary.ts` (or new script if cleaner)
 4. `package.json` (script entry if needed)
 ### Acceptance
 1. Tool works on current audit logs without requiring new event types.
 2. New event types are incorporated when present.
 3. Output is deterministic and safe on missing fields.
 ### Suggested commit message
 `feat(audit): add phase-0 baseline summary metrics for runs and reactions`
 ## Ticket 0.5 — Docs + Diagram + State Sync
 ### Scope
 Document new observability fields and baseline workflow:
 1. Protocol/observability notes (if gateway payloads changed).
 2. Architecture docs review for run lifecycle observability path.
 3. Plan/state updates with executed commands and artifacts.
 ### Files
 1. `README.md`
 2. `docs/api/PROTOCOL.md`
 3. `docs/architecture/AGENT_DIAGRAM.md`
 4. `docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md`
 5. `docs/plans/state.json`
 6. `docs/plans/artifacts/*` (baseline outputs)
 ### Acceptance
 1. Diagram review is explicitly documented.
 2. `state.json` includes ticket completion/test status.
 3. Artifact links are present and reproducible.
 ### Suggested commit message
 `docs(observability): document phase-0 telemetry and baseline workflow`
 ## Validation Commands (per completed ticket set)
 Run focused suites first, then full validation when phase is complete:
 ```bash
 pnpm typecheck
 pnpm test:run src/audit/logger.test.ts
 pnpm test:run src/gateway/metrics.test.ts
 pnpm test:run src/gateway/handlers/agent.test.ts
 pnpm test:run src/daemon/routing.test.ts
 pnpm test:run src/audit/backendCanarySummary.test.ts
 pnpm test:run src/gateway/handlers/observability.test.ts
 pnpm test:run
 pnpm lint
 pnpm build
 ```
 ## Rollout and Exit Criteria
 Phase 0 is complete when:
 1. Baseline metrics are emitted for at least one representative channel session and one gateway session.
 2. A baseline summary artifact is generated and committed under `docs/plans/artifacts/`.
 3. No user-visible response behavior changed compared to pre-phase baseline.
 ## Subagent Model Assignment Plan
 1. `claude-haiku-4.5`:
   - schema additions, mechanical logger wiring, docs consistency edits.
 2. `claude-sonnet-4.6`:
   - gateway/router instrumentation and metrics implementation.
 3. `claude-opus-4.6`:
   - event taxonomy review, baseline gate design, failure-mode coverage review.
@@ -1,8 +1,45 @@
 {
  "version": "1.0",
-  "updated_at": "2026-02-24",
+  "updated_at": "2026-02-25",
  "description": "Tracks the status of all Flynn plans and implementation phases",
  "plans": {
    "phase0-ticket-0.1-audit-schema-extension": {
      "status": "completed",
      "date": "2026-02-25",
      "updated": "2026-02-25",
      "summary": "Implemented Phase 0 Ticket 0.1 by extending audit event taxonomy with run lifecycle and reaction decision events (`run.state`, `run.cancel`, `reaction.match`, `reaction.skip`), adding typed logger methods/level routing, and adding regression coverage for serialization and level expectations.",
      "files_modified": [
        "src/audit/types.ts",
        "src/audit/logger.ts",
        "src/audit/logger.test.ts",
        "docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md",
        "docs/plans/state.json"
      ],
      "test_status": "pnpm test:run src/audit/logger.test.ts + pnpm typecheck passing"
    },
    "phase0-instrumentation-ticket-checklist": {
      "status": "completed",
      "date": "2026-02-25",
      "updated": "2026-02-25",
      "summary": "Added a decision-complete Phase 0 implementation ticket checklist for deeper surfaces/behavior-stack baseline instrumentation, including atomic ticket boundaries, file-level scope, acceptance criteria, validation commands, and model-tier subagent assignments.",
      "files_modified": [
        "docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md",
        "docs/plans/2026-02-25-deeper-end-user-surfaces-and-integrated-behavior-stack-plan.md",
        "docs/plans/state.json"
      ],
      "test_status": "planning/docs update only; no runtime code changes"
    },
    "deeper-end-user-surfaces-and-integrated-behavior-stack-plan": {
      "status": "completed",
      "date": "2026-02-25",
      "updated": "2026-02-25",
      "summary": "Added a decision-complete roadmap for deepening Flynn's end-user surfaces and integrated behavior stack (balanced hybrid: run-control semantics, reactions/proactive behavior v2, companion/canvas/voice surface depth, and canary rollout gates) with explicit success criteria and file-level implementation scope.",
      "files_modified": [
        "docs/plans/2026-02-25-deeper-end-user-surfaces-and-integrated-behavior-stack-plan.md",
        "docs/plans/state.json"
      ],
      "test_status": "planning/docs update only; no runtime code changes"
    },
    "pi-embedded-manual-runtime-mode-control": {
      "status": "completed",
      "date": "2026-02-24",
@@ -6588,7 +6625,10 @@
    "model_router_correctness": "completed — fallback paths now avoid duplicate clients, apply retry policy consistently, and reject unsupported OpenAI OAuth tool requests early",
    "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback, plus 2026-02-23 arg hydration hardening, tool.args_rewritten audit metric, transient fetch retry/timeout hardening, localhost->127.0.0.1 fallback for transcription endpoint connectivity, and whisper docker-compose entrypoint arg fix for port 18801",
    "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
-    "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas",
+    "deeper_surfaces_behavior_stack_plan": "completed — documented a decision-complete balanced-hybrid roadmap for OpenClaw-like end-user surface depth plus integrated behavior semantics with phased scope, acceptance gates, and rollout constraints",
    "deeper_surfaces_phase0_ticket_pack": "completed — produced an atomic implementation checklist for Phase 0 baseline observability work (audit events, router/gateway emitters, metrics counters, baseline summary tooling, docs sync)",
    "deeper_surfaces_phase0_ticket_01": "completed — audit schema/logger now capture run lifecycle and reaction decision baseline events (`run.state`, `run.cancel`, `reaction.match`, `reaction.skip`) with regression test coverage",
    "next_up": "Implement Ticket 0.2 from docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md",
    "pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
    "pi_embedded_evaluation_phase": "completed — final decision rollback (applied in runtime config): Window A failed latency/fallback gates (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B remained sample-insufficient; controlled probes verified guard coverage (pi_no_tools_mode/capability_query/attachments_present each hit once)",
    "pi_embedded_manual_mode": "completed — added persisted runtime backend controls for manual Pi activation/deactivation (`/runtime` preferred, `/backend` alias; `status`, `activate pi`, `deactivate pi`, `use config`) while keeping config-driven default routing",
@@ -4,6 +4,20 @@ import { join, resolve } from 'path';
 import { describe, expect, it } from 'vitest';
 import { AuditLogger } from './logger.js';
 /**
  * Pauses briefly (25 ms) so tests can yield to pending timers/I/O after
  * closing the logger and before reading the audit file back.
  *
  * @returns A promise that resolves with no value once the delay has elapsed.
  */
 function waitForFlush(): Promise<void> {
  const flushDelayMs = 25;
  return new Promise<void>((done) => {
    setTimeout(done, flushDelayMs);
  });
 }
 /**
  * Reads a newline-delimited JSON audit log and parses each line into a record.
  *
  * @param filePath - Path of the audit log file to read (UTF-8).
  * @returns The parsed records in file order, or an empty array when the file
  *   contains nothing but whitespace.
  * @throws If the file cannot be read or any line is not valid JSON.
  */
 function readAuditEvents(filePath: string): Array<{ event_type: string; level: string; event: Record<string, unknown> }> {
  type AuditRecord = { event_type: string; level: string; event: Record<string, unknown> };
  const raw = readFileSync(filePath, 'utf-8').trim();
  const records: AuditRecord[] = [];
  if (raw.length === 0) {
    return records;
  }
  // One JSON document per line; the shape is asserted, not validated.
  for (const line of raw.split('\n')) {
    records.push(JSON.parse(line) as AuditRecord);
  }
  return records;
 }
 describe('AuditLogger', () => {
  it('expands ~ in audit path before writing logs', async () => {
    const previousHome = process.env.HOME;
@@ -28,7 +42,7 @@ describe('AuditLogger', () => {
      logger.systemStart('test-component');
      await logger.close();
-      await new Promise((resolvePromise) => setTimeout(resolvePromise, 25));
+      await waitForFlush();
      const loggerPath = (logger as unknown as { config: { path: string } }).config.path;
      const expectedPath = resolve(tempHome, '.local/share/flynn/audit.log');
@@ -46,4 +60,98 @@ describe('AuditLogger', () => {
      rmSync(tempHome, { recursive: true, force: true });
    }
  });
  // Verifies that the run/reaction baseline logger methods write their events
  // to the audit file and that level routing matches expectations
  // (run.state with state 'error' -> 'error', reaction.skip -> 'debug').
  it('writes run and reaction baseline events with expected levels', async () => {
    // Redirect HOME to a throwaway directory so the '~' audit path resolves
    // inside it; the original value is restored in the finally block.
    const previousHome = process.env.HOME;
    const tempHome = mkdtempSync(join(tmpdir(), 'flynn-audit-home-'));
    process.env.HOME = tempHome;
    try {
      // All categories are set to 'debug' so the debug-level reaction.skip
      // event is not filtered out before it reaches the file.
      const logger = new AuditLogger({
        enabled: true,
        path: '~/.local/share/flynn/audit.log',
        max_size_mb: 10,
        keep_days: 30,
        levels: {
          tools: 'debug',
          sessions: 'debug',
          automation: 'debug',
        },
      });
      // Emit one of each new event type; run.state is emitted twice so both a
      // non-error and an error state are present in the log.
      logger.runState({
        session_id: 'telegram:123',
        channel: 'telegram',
        sender: '123',
        source: 'channel',
        state: 'start',
        request_id: 'req-1',
      });
      logger.runState({
        session_id: 'telegram:123',
        channel: 'telegram',
        sender: '123',
        source: 'channel',
        state: 'error',
        request_id: 'req-1',
        error: 'model timeout',
      });
      logger.runCancel({
        session_id: 'telegram:123',
        channel: 'telegram',
        sender: '123',
        source: 'channel',
        requested: true,
        acknowledged: true,
        request_id: 'req-1',
        latency_ms: 120,
      });
      logger.reactionMatch({
        session_id: 'telegram:123',
        channel: 'telegram',
        sender: '123',
        source: 'channel',
        rule_name: 'daily-briefing-hint',
        candidate_count: 4,
        filter_summary: 'contains:briefing',
      });
      logger.reactionSkip({
        session_id: 'telegram:123',
        channel: 'telegram',
        sender: '123',
        source: 'channel',
        reason: 'no_match',
        candidate_count: 4,
      });
      // Close the logger, then pause briefly before reading the file back.
      await logger.close();
      await waitForFlush();
      const expectedPath = resolve(tempHome, '.local/share/flynn/audit.log');
      const events = readAuditEvents(expectedPath);
      const eventTypes = events.map((event) => event.event_type);
      // Every new event type must appear at least once.
      expect(eventTypes).toContain('run.state');
      expect(eventTypes).toContain('run.cancel');
      expect(eventTypes).toContain('reaction.match');
      expect(eventTypes).toContain('reaction.skip');
      // A run.state event carrying state 'error' is expected at 'error' level.
      const runError = events.find((event) => (
        event.event_type === 'run.state'
        && event.event.state === 'error'
      ));
      expect(runError?.level).toBe('error');
      // reaction.skip is expected at 'debug' level with its payload intact.
      const reactionSkip = events.find((event) => event.event_type === 'reaction.skip');
      expect(reactionSkip?.level).toBe('debug');
      expect(reactionSkip?.event.reason).toBe('no_match');
    } finally {
      // Restore HOME exactly as it was (including the "unset" case) and
      // remove the temporary directory.
      if (previousHome === undefined) {
        delete process.env.HOME;
      } else {
        process.env.HOME = previousHome;
      }
      rmSync(tempHome, { recursive: true, force: true });
    }
  });
 });
@@ -22,6 +22,10 @@ import type {
  SessionAutoCompactEvent,
  UserActionEvent,
  QueuePreemptEvent,
  RunStateEvent,
  RunCancelEvent,
  ReactionMatchEvent,
  ReactionSkipEvent,
  BackendRouteEvent,
  BackendSuccessEvent,
  BackendFallbackEvent,
@@ -211,6 +215,28 @@ export class AuditLogger {
    this.write({ level: 'info', event_type: 'queue.preempt', event: event as unknown as Record<string, unknown> });
  }
  /**
   * Records a `run.state` audit event.
   *
   * The event is written at `error` level when `event.state` is `'error'` and
   * at `info` level for every other state. It is dropped when the `sessions`
   * category is not configured to log at that level.
   *
   * @param event - Run lifecycle payload to record.
   */
  runState(event: RunStateEvent): void {
    const level = event.state !== 'error' ? 'info' : 'error';
    if (this.shouldLog('sessions', level)) {
      const payload = event as unknown as Record<string, unknown>;
      this.write({ level, event_type: 'run.state', event: payload });
    }
  }
  /**
   * Records a `run.cancel` audit event.
   *
   * Acknowledged cancellations are written at `info` level; unacknowledged
   * ones are written at `warn`. The event is dropped when the `sessions`
   * category is not configured to log at that level.
   *
   * @param event - Cancellation payload to record.
   */
  runCancel(event: RunCancelEvent): void {
    const level = event.acknowledged ? 'info' : 'warn';
    if (this.shouldLog('sessions', level)) {
      const payload = event as unknown as Record<string, unknown>;
      this.write({ level, event_type: 'run.cancel', event: payload });
    }
  }
  /**
   * Records a `reaction.match` audit event at `info` level.
   *
   * The event is dropped when the `sessions` category is not configured to
   * log at `info`.
   *
   * @param event - Reaction match payload to record.
   */
  reactionMatch(event: ReactionMatchEvent): void {
    if (this.shouldLog('sessions', 'info')) {
      const payload = event as unknown as Record<string, unknown>;
      this.write({ level: 'info', event_type: 'reaction.match', event: payload });
    }
  }
  /**
   * Records a `reaction.skip` audit event at `debug` level.
   *
   * The event is dropped when the `sessions` category is not configured to
   * log at `debug`.
   *
   * @param event - Reaction skip payload to record.
   */
  reactionSkip(event: ReactionSkipEvent): void {
    if (this.shouldLog('sessions', 'debug')) {
      const payload = event as unknown as Record<string, unknown>;
      this.write({ level: 'debug', event_type: 'reaction.skip', event: payload });
    }
  }
  backendRoute(event: BackendRouteEvent): void {
    if (!this.shouldLog('sessions', 'info')) {return;}
    this.write({ level: 'info', event_type: 'backend.route', event: event as unknown as Record<string, unknown> });
@@ -12,6 +12,8 @@ export type AuditEventType =
  // Session lifecycle
  | 'session.create' | 'session.message' | 'session.delete' | 'session.transfer' | 'session.compact' | 'session.checkpoint' | 'session.auto_compact' | 'user.action'
  | 'queue.preempt'
  | 'run.state' | 'run.cancel'
  | 'reaction.match' | 'reaction.skip'
  | 'backend.route' | 'backend.success' | 'backend.fallback'
  // Automation - Cron
  | 'cron.trigger' | 'cron.sent' | 'cron.add' | 'cron.remove'
@@ -232,6 +234,47 @@ export interface QueuePreemptEvent {
  cancelled_active_run: boolean;
 }
 /**
  * Payload of the `run.state` audit event: one record per run lifecycle
  * transition for a session.
  */
 export interface RunStateEvent {
  session_id: string;
  channel: string;
  sender: string;
  /** Which path produced the event. NOTE(review): presumably the gateway handler vs. channel routing path — confirm against emitters. */
  source: 'gateway' | 'channel';
  /** Lifecycle state being reported; `'error'` causes the logger to write at error level. */
  state: 'start' | 'complete' | 'cancel_requested' | 'cancelled' | 'error';
  /** Optional request identifier. NOTE(review): presumably correlates events of the same run — confirm. */
  request_id?: string;
  /** Optional duration in milliseconds. NOTE(review): measurement start point is not defined here — confirm with emitters. */
  duration_ms?: number;
  /** Optional error description, e.g. supplied alongside `state: 'error'`. */
  error?: string;
 }
 /**
  * Payload of the `run.cancel` audit event, describing a cancellation
  * request and whether it was acknowledged.
  */
 export interface RunCancelEvent {
  session_id: string;
  channel: string;
  sender: string;
  /** Which path produced the event. NOTE(review): presumably the gateway handler vs. channel routing path — confirm against emitters. */
  source: 'gateway' | 'channel';
  /** Whether a cancellation was requested. */
  requested: boolean;
  /** Whether the cancellation was acknowledged; when false the logger writes at warn level instead of info. */
  acknowledged: boolean;
  /** Optional request identifier. NOTE(review): presumably correlates with the matching run events — confirm. */
  request_id?: string;
  /** Optional latency in milliseconds. NOTE(review): presumably request-to-acknowledgement time — confirm with emitters. */
  latency_ms?: number;
 }
 /**
  * Payload of the `reaction.match` audit event, recorded when a reaction
  * rule is selected.
  */
 export interface ReactionMatchEvent {
  /** Optional here, unlike the run events where `session_id` is required. */
  session_id?: string;
  channel: string;
  sender: string;
  /** Which path produced the event. NOTE(review): presumably the gateway handler vs. channel routing path — confirm against emitters. */
  source: 'gateway' | 'channel';
  /** Name of the reaction rule that matched. */
  rule_name: string;
  /** Optional count of candidates. NOTE(review): presumably the number of rules evaluated — confirm. */
  candidate_count?: number;
  /** Optional short description of the filter, e.g. `'contains:briefing'`. */
  filter_summary?: string;
 }
 /**
  * Payload of the `reaction.skip` audit event, recorded when no reaction is
  * applied. The logger writes this event at debug level.
  */
 export interface ReactionSkipEvent {
  /** Optional here, unlike the run events where `session_id` is required. */
  session_id?: string;
  channel: string;
  sender: string;
  /** Which path produced the event. NOTE(review): presumably the gateway handler vs. channel routing path — confirm against emitters. */
  source: 'gateway' | 'channel';
  /** Category explaining why no reaction was applied. */
  reason: 'no_rules' | 'no_match' | 'disabled' | 'channel_mismatch' | 'filter_miss';
  /** Required count of candidates. NOTE(review): presumably the number of rules considered before skipping — confirm. */
  candidate_count: number;
 }
 export interface BackendRouteEvent {
  session_id: string;
  channel: string;