From 20224f1601f77d2d51c2f45c80233aa33878f353 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Fri, 27 Feb 2026 09:01:43 -0800 Subject: [PATCH] feat(audit): add phase0 backend drift and freshness gates --- README.md | 9 +- docs/api/PROTOCOL.md | 2 +- docs/architecture/AGENT_DIAGRAM.md | 1 + .../GATEWAY_SESSIONS_AND_QUEUE.md | 1 + ...phase0-instrumentation-ticket-checklist.md | 2 +- docs/plans/state.json | 25 +- package.json | 2 + .../check-phase0-baseline-backend-drift.ts | 446 ++++++++++++++++++ src/audit/phase0BaselineDrift.test.ts | 340 +++++++++++++ src/audit/phase0BaselineDrift.ts | 336 +++++++++++++ 10 files changed, 1157 insertions(+), 7 deletions(-) create mode 100644 scripts/check-phase0-baseline-backend-drift.ts create mode 100644 src/audit/phase0BaselineDrift.test.ts create mode 100644 src/audit/phase0BaselineDrift.ts diff --git a/README.md b/README.md index 28064bd..8afeb0e 100644 --- a/README.md +++ b/README.md @@ -1640,9 +1640,14 @@ One-shot refresh for both channel + gateway live windows: pnpm audit:phase0-baseline:live:refresh ``` -Cadence scheduling (example: every 6 hours via host cron): +Backend drift/freshness gate for backend-scoped artifacts (`pi_embedded` vs `native`): ```bash -0 */6 * * * cd /path/to/flynn && pnpm audit:phase0-baseline:live:refresh >> ~/.local/share/flynn/phase0_baseline_refresh.log 2>&1 +pnpm audit:phase0-baseline:live:drift +``` + +Cadence scheduling (example: every 6 hours via host cron) with drift check: +```bash +0 */6 * * * cd /path/to/flynn && pnpm audit:phase0-baseline:live:refresh:drift >> ~/.local/share/flynn/phase0_baseline_refresh.log 2>&1 ``` `audit:phase0-baseline:live*` scripts now default to the current UTC date tag when `--tag` is omitted. 
diff --git a/docs/api/PROTOCOL.md b/docs/api/PROTOCOL.md index 9de9caf..f59321e 100644 --- a/docs/api/PROTOCOL.md +++ b/docs/api/PROTOCOL.md @@ -23,7 +23,7 @@ The gateway provides: - **HTTP Server**: Serves static dashboard and handles webhook endpoints - **Node Capability Negotiation**: Optional companion-node role/capability registration -Operational note: onboarding (`flynn setup` / `flynn onboard`) now runs post-save live readiness checks (model/channel/memory/automation) and prints a guided first-success task flow. Companion CLI now also supports bootstrap-manifest export (`flynn companion --export-bootstrap `), release-bundle export (`--export-release-bundle ` with optional `--signing-key`/`--signing-key-id` signature output), release-bundle verification (`--verify-release-bundle ` with optional `--verify-signing-key`/`--verify-signing-key-id`/`--require-signature`), platform shell-template export (`--export-shell-template `), plus richer shell bootstrap flags for status/location/push (`--app-version`, `--latitude/--longitude`, `--push-token`, etc.) for desktop/mobile app packaging without changing JSON-RPC method/event shapes. Audit observability now includes live phase-0 baseline capture flows: `pnpm audit:phase0-baseline:live` for channel-origin windows, backend-scoped variants (`pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`) via `--backend`, `pnpm audit:phase0-baseline:live:gateway` (auto-detected cancel window) for gateway-origin windows, and `pnpm audit:phase0-baseline:live:refresh` for one-shot refresh of both windows. These scripts default to current UTC-date tags unless `--tag` is explicitly provided. +Operational note: onboarding (`flynn setup` / `flynn onboard`) now runs post-save live readiness checks (model/channel/memory/automation) and prints a guided first-success task flow. 
Companion CLI now also supports bootstrap-manifest export (`flynn companion --export-bootstrap `), release-bundle export (`--export-release-bundle ` with optional `--signing-key`/`--signing-key-id` signature output), release-bundle verification (`--verify-release-bundle ` with optional `--verify-signing-key`/`--verify-signing-key-id`/`--require-signature`), platform shell-template export (`--export-shell-template `), plus richer shell bootstrap flags for status/location/push (`--app-version`, `--latitude/--longitude`, `--push-token`, etc.) for desktop/mobile app packaging without changing JSON-RPC method/event shapes. Audit observability now includes live phase-0 baseline capture flows: `pnpm audit:phase0-baseline:live` for channel-origin windows, backend-scoped variants (`pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`) via `--backend`, `pnpm audit:phase0-baseline:live:gateway` (auto-detected cancel window) for gateway-origin windows, `pnpm audit:phase0-baseline:live:refresh` for one-shot refresh of both windows, and `pnpm audit:phase0-baseline:live:drift` for backend artifact freshness/drift gates. These scripts default to current UTC-date tags unless `--tag` is explicitly provided. ### Execution Model (Sessions + Per-Session Queue) diff --git a/docs/architecture/AGENT_DIAGRAM.md b/docs/architecture/AGENT_DIAGRAM.md index c73f8c0..e660bce 100644 --- a/docs/architecture/AGENT_DIAGRAM.md +++ b/docs/architecture/AGENT_DIAGRAM.md @@ -170,6 +170,7 @@ Gateway streaming UX signals: - `pnpm audit:phase0-baseline:live:pi` and `pnpm audit:phase0-baseline:live:native` capture backend-scoped channel windows using `backend.route` timelines. - `pnpm audit:phase0-baseline:live:gateway` captures gateway-origin baseline windows by auto-selecting the latest cancel/cancelled session window (or use `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit windows). 
- `pnpm audit:phase0-baseline:live:refresh` runs both channel + gateway capture commands in one step for cadence refreshes. +- `pnpm audit:phase0-baseline:live:drift` evaluates backend-scoped artifact freshness/drift gates, and `pnpm audit:phase0-baseline:live:refresh:drift` runs capture + drift checks in one cadence step. - `audit:phase0-baseline:live*` scripts are cadence-safe by default (UTC-date tags auto-generated unless explicitly overridden). - Canvas artifacts are persisted by the gateway so session UI surfaces can recover after daemon restarts. - TTS synthesis uses an ordered provider chain with health cooldown tracking; if all providers fail, replies degrade to text-only without dropping the response. diff --git a/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md b/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md index d813d85..d0da820 100644 --- a/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md +++ b/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md @@ -35,6 +35,7 @@ If you only want the protocol surface, see `docs/api/PROTOCOL.md`. - Backend-scoped channel snapshots can be regenerated with `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native` (`--backend` filtering via `backend.route` timelines). - Gateway-origin phase-0 windows (including cancel-path samples) can be captured with `pnpm audit:phase0-baseline:live:gateway` (auto-detect latest cancel window) or `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit bounds. - `pnpm audit:phase0-baseline:live:refresh` runs both capture paths to refresh channel + gateway artifacts in one command. +- `pnpm audit:phase0-baseline:live:drift` checks backend-scoped artifact freshness/drift gates; `pnpm audit:phase0-baseline:live:refresh:drift` chains refresh + drift checks for scheduled cadence runs. - `audit:phase0-baseline:live*` package scripts now omit fixed tags so scheduled runs automatically roll to current UTC-date artifact tags. 
- Companion CLI supports one-shot shell bootstrap metadata for live sessions (`--app-version`/`--status-text`, `--latitude`/`--longitude`, `--push-token`) so desktop/mobile wrappers can initialize node status/location/push in a single launch flow. - Canvas artifacts are persisted per session under the gateway data directory for UI recovery across restarts. diff --git a/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md b/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md index 570e3af..3e37939 100644 --- a/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md +++ b/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md @@ -203,7 +203,7 @@ Phase 0 is complete when: 2. A baseline summary artifact is generated and committed under `docs/plans/artifacts/`. 3. No user-visible response behavior changed compared to pre-phase baseline. -Follow-up status (2026-02-27): live channel-session artifacts exist under `docs/plans/artifacts/phase0_baseline_live_2026-02-27.*` via `pnpm audit:phase0-baseline:live` (anonymized IDs), and a second gateway-origin live window (including `run.cancel` + `cancel_requested`/`cancelled`) exists under `docs/plans/artifacts/phase0_baseline_live_gateway_2026-02-27.*`. Gateway window refreshes can now run via `pnpm audit:phase0-baseline:live:gateway` (auto-selected cancel window), both windows can be refreshed together with `pnpm audit:phase0-baseline:live:refresh` (scheduling example included in README), and backend-scoped channel windows are now available via `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`. +Follow-up status (2026-02-27): live channel-session artifacts exist under `docs/plans/artifacts/phase0_baseline_live_2026-02-27.*` via `pnpm audit:phase0-baseline:live` (anonymized IDs), and a second gateway-origin live window (including `run.cancel` + `cancel_requested`/`cancelled`) exists under `docs/plans/artifacts/phase0_baseline_live_gateway_2026-02-27.*`. 
Gateway window refreshes can now run via `pnpm audit:phase0-baseline:live:gateway` (auto-selected cancel window), both windows can be refreshed together with `pnpm audit:phase0-baseline:live:refresh` (scheduling example included in README), backend-scoped channel windows are now available via `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`, and backend artifact freshness/drift checks are now available via `pnpm audit:phase0-baseline:live:drift` (or chained with `pnpm audit:phase0-baseline:live:refresh:drift`). ## Subagent Model Assignment Plan diff --git a/docs/plans/state.json b/docs/plans/state.json index 4b520be..78aa33c 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -196,6 +196,25 @@ ], "test_status": "pnpm audit:phase0-baseline:live:pi + pnpm audit:phase0-baseline:live:native + pnpm test:run src/audit/phase0LiveBaseline.test.ts src/audit/phase0BaselineSummary.test.ts + pnpm typecheck passing" }, + "phase0-live-baseline-backend-drift-monitoring": { + "status": "completed", + "date": "2026-02-27", + "updated": "2026-02-27", + "summary": "Implemented backend-scoped phase-0 baseline drift/freshness gates for cadence monitoring by adding artifact comparison helpers, threshold evaluation, and an auto-discovery CLI (`check-phase0-baseline-backend-drift.ts`) with package scripts for standalone checks and chained refresh+drift runs.", + "files_modified": [ + "src/audit/phase0BaselineDrift.ts", + "src/audit/phase0BaselineDrift.test.ts", + "scripts/check-phase0-baseline-backend-drift.ts", + "package.json", + "README.md", + "docs/api/PROTOCOL.md", + "docs/architecture/AGENT_DIAGRAM.md", + "docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md", + "docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/audit/phase0BaselineDrift.test.ts + pnpm audit:phase0-baseline:live:drift + pnpm typecheck passing" + }, "phase0-instrumentation-ticket-checklist": { 
"status": "completed", "date": "2026-02-25", @@ -7345,7 +7364,7 @@ } }, "overall_progress": { - "total_test_count": 2589, + "total_test_count": 2590, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -7381,8 +7400,8 @@ "deeper_surfaces_phase0_ticket_02": "completed — gateway + daemon routing emit run lifecycle/cancel telemetry and reaction match/skip audit events with filter summaries and cancellation latency, plus focused tests", "deeper_surfaces_phase0_ticket_03": "completed — gateway metrics now track run-state outcomes, cancel latency samples, and reaction decision counters with routing/gateway emitters", "deeper_surfaces_phase0_ticket_04": "completed — added phase-0 baseline summary tooling for run outcomes, cancel latency, and reaction decisions with markdown/json CLI output", - "deeper_surfaces_phase0_ticket_05": "completed — documented phase-0 telemetry fields/workflow, refreshed architecture/protocol docs, and generated anonymized live baseline artifacts for channel, gateway, and backend-scoped (pi/native) traffic windows", - "next_up": "Apply scheduled `pnpm audit:phase0-baseline:live:refresh` in each active environment and monitor backend-scoped (`pi_embedded` vs `native`) artifact freshness/drift over at least one full cadence cycle before additional run-control/reaction semantic changes.", + "deeper_surfaces_phase0_ticket_05": "completed — documented phase-0 telemetry fields/workflow, refreshed architecture/protocol docs, generated anonymized live baseline artifacts for channel/gateway/backend-scoped (pi/native) windows, and added backend artifact freshness/drift gates (`pnpm audit:phase0-baseline:live:drift`)", + "next_up": "Run scheduled `pnpm audit:phase0-baseline:live:refresh:drift` in each active environment and observe at least one full cadence cycle before tightening drift thresholds or changing additional run-control/reaction semantics.", "pi_embedded_canary_spike": "completed — added optional 
pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default", "pi_embedded_evaluation_phase": "completed — final decision rollback (applied in runtime config): Window A failed latency/fallback gates (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B remained sample-insufficient; controlled probes verified guard coverage (pi_no_tools_mode/capability_query/attachments_present each hit once)", "pi_embedded_manual_mode": "completed — added persisted runtime backend controls for manual Pi activation/deactivation (`/runtime` preferred, `/backend` alias; `status`, `activate pi`, `deactivate pi`, `use config`) while keeping config-driven default routing", diff --git a/package.json b/package.json index 14477ba..6d8d99a 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,8 @@ "audit:phase0-baseline:live:native": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source channel --backend native --exclude-session-substring probe", "audit:phase0-baseline:live:gateway": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source gateway --auto-gateway-cancel-window", "audit:phase0-baseline:live:refresh": "pnpm audit:phase0-baseline:live && pnpm audit:phase0-baseline:live:gateway", + "audit:phase0-baseline:live:drift": "node --import tsx/esm scripts/check-phase0-baseline-backend-drift.ts --artifacts-dir docs/plans/artifacts --backend pi_embedded,native --max-age-hours 36 --min-candidate-sampled-events 10 --max-sampled-events-drop-pct 80 --max-run-outcomes-drop-pct 80 --max-completion-rate-drop-pp 35 --max-cancel-rate-increase-pp 25 --max-error-rate-increase-pp 25 --max-cancel-latency-p95-increase-ms 6000", + "audit:phase0-baseline:live:refresh:drift": "pnpm audit:phase0-baseline:live:refresh && pnpm 
audit:phase0-baseline:live:drift", "audit:backend-canary:probes": "node --import tsx/esm scripts/run-pi-canary-guard-probes.ts", "companion:bundle": "node --import tsx/esm scripts/build-companion-release-bundle.ts", "companion:reference-apps": "node --import tsx/esm scripts/export-companion-reference-apps.ts", diff --git a/scripts/check-phase0-baseline-backend-drift.ts b/scripts/check-phase0-baseline-backend-drift.ts new file mode 100644 index 0000000..777ecaf --- /dev/null +++ b/scripts/check-phase0-baseline-backend-drift.ts @@ -0,0 +1,446 @@ +#!/usr/bin/env node + +import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises'; +import { dirname, resolve } from 'node:path'; +import { parseArgs } from 'node:util'; +import { + comparePhase0BaselineDrift, + evaluatePhase0BaselineDriftGate, + renderPhase0BaselineDriftSnapshot, + type Phase0BaselineArtifactDocument, + type Phase0BaselineDriftComparison, + type Phase0BaselineDriftGateThresholds, +} from '../src/audit/phase0BaselineDrift.js'; +import type { Phase0BackendTarget } from '../src/audit/phase0LiveBaseline.js'; + +type OutputFormat = 'markdown' | 'json'; + +interface ArtifactRecord { + backend: Phase0BackendTarget; + tag: string; + path: string; + generatedAtIso?: string; + generatedAtMs?: number; + document: Phase0BaselineArtifactDocument; +} + +interface BackendDriftResult { + backend: Phase0BackendTarget; + candidate: ArtifactRecord; + baseline?: ArtifactRecord; + comparison: Phase0BaselineDriftComparison; + freshness: { + enabled: boolean; + pass: boolean; + actual_age_hours: number | null; + threshold_hours: number | null; + }; + driftGate: ReturnType<typeof evaluatePhase0BaselineDriftGate>; /* same value main() stores from evaluatePhase0BaselineDriftGate(comparison, thresholds) */ + pass: boolean; +} + +const BACKEND_TARGETS: readonly Phase0BackendTarget[] = [ + 'native', + 'claude_code', + 'opencode', + 'codex', + 'gemini', + 'pi_embedded', +]; + +const ARTIFACT_JSON_PATTERN = /^phase0_baseline_live_backend_(native|claude_code|opencode|codex|gemini|pi_embedded)_(.+)\.json$/; + +function usage(): string { + return [ + 'Usage: 
node --import tsx/esm scripts/check-phase0-baseline-backend-drift.ts [options]', + '', + 'Options:', + ' --artifacts-dir Artifacts directory (default: docs/plans/artifacts)', + ' --backend Backends to check (default: pi_embedded,native)', + ' --tag Candidate artifact tag (default: latest available per backend)', + ' --baseline-tag Baseline artifact tag (default: previous available per backend)', + ' --max-age-hours Require candidate artifact freshness (optional)', + ' --require-baseline-history Fail when no prior artifact exists', + ' --format Output format (default: markdown)', + ' --out Write output to file instead of stdout', + '', + 'Drift thresholds (optional):', + ' --min-candidate-sampled-events ', + ' --min-baseline-sampled-events ', + ' --max-sampled-events-drop-pct ', + ' --max-run-outcomes-drop-pct ', + ' --max-completion-rate-drop-pp ', + ' --max-cancel-rate-increase-pp ', + ' --max-error-rate-increase-pp ', + ' --max-cancel-latency-p95-increase-ms ', + ].join('\n'); +} + +function parseCsv(value: string | undefined): string[] | undefined { + if (!value) { + return undefined; + } + const values = value + .split(',') + .map((item) => item.trim()) + .filter((item) => item.length > 0); + return values.length > 0 ? values : undefined; +} + +function parseOptionalNumber(raw: string | undefined, flag: string): number | undefined { + if (!raw) { + return undefined; + } + const parsed = raw.trim() === '' ? Number.NaN : Number(raw); /* Number() coerces whitespace-only text to 0, so blank values are rejected here instead of becoming a zero threshold */ + if (!Number.isFinite(parsed)) { + throw new Error(`Invalid ${flag} value "${raw}". Expected a number.`); + } + return parsed; +} + +function parseBackends(raw: string | undefined): Phase0BackendTarget[] { + const values = parseCsv(raw) ?? 
['pi_embedded', 'native']; + const parsed: Phase0BackendTarget[] = []; + for (const value of values) { + if (BACKEND_TARGETS.includes(value as Phase0BackendTarget)) { + parsed.push(value as Phase0BackendTarget); + continue; + } + throw new Error(`Invalid backend "${value}".`); + } + return parsed; +} + +function parseFormat(raw: string | undefined): OutputFormat { + const format = raw ?? 'markdown'; + if (format !== 'markdown' && format !== 'json') { + throw new Error(`Invalid --format value "${format}".`); + } + return format; +} + +function sortRecordsDesc(records: ArtifactRecord[]): ArtifactRecord[] { + return [...records].sort((a, b) => { + const aTs = a.generatedAtMs ?? 0; + const bTs = b.generatedAtMs ?? 0; + if (aTs !== bTs) { + return bTs - aTs; + } + return b.tag.localeCompare(a.tag); + }); +} + +function formatSignedNumber(value: number | null, suffix = ''): string { + if (value === null || !Number.isFinite(value)) { + return 'n/a'; + } + const rounded = Math.round(value * 100) / 100; + const sign = rounded > 0 ? 
'+' : ''; + return `${sign}${rounded}${suffix}`; +} + +function formatFreshnessHours(value: number | null): string { + if (value === null || !Number.isFinite(value)) { + return 'n/a'; + } + return `${Math.round(value * 100) / 100}`; +} + +async function writeOutput(pathValue: string, output: string): Promise<void> { /* creates missing parent directories, then writes output plus a trailing newline */ + await mkdir(dirname(pathValue), { recursive: true }); + await writeFile(pathValue, `${output}\n`, 'utf8'); +} + +function buildThresholds(values: Record<string, unknown>): Phase0BaselineDriftGateThresholds { /* maps parsed CLI flag values onto gate thresholds; numeric flags go through parseOptionalNumber */ + return { + requireBaselineHistory: Boolean(values['require-baseline-history']), + minCandidateSampledEvents: parseOptionalNumber(values['min-candidate-sampled-events'] as string | undefined, '--min-candidate-sampled-events'), + minBaselineSampledEvents: parseOptionalNumber(values['min-baseline-sampled-events'] as string | undefined, '--min-baseline-sampled-events'), + maxSampledEventsDropPct: parseOptionalNumber(values['max-sampled-events-drop-pct'] as string | undefined, '--max-sampled-events-drop-pct'), + maxRunOutcomesDropPct: parseOptionalNumber(values['max-run-outcomes-drop-pct'] as string | undefined, '--max-run-outcomes-drop-pct'), + maxCompletionRateDropPp: parseOptionalNumber(values['max-completion-rate-drop-pp'] as string | undefined, '--max-completion-rate-drop-pp'), + maxCancelRateIncreasePp: parseOptionalNumber(values['max-cancel-rate-increase-pp'] as string | undefined, '--max-cancel-rate-increase-pp'), + maxErrorRateIncreasePp: parseOptionalNumber(values['max-error-rate-increase-pp'] as string | undefined, '--max-error-rate-increase-pp'), + maxCancelLatencyP95IncreaseMs: parseOptionalNumber(values['max-cancel-latency-p95-increase-ms'] as string | undefined, '--max-cancel-latency-p95-increase-ms'), + }; +} + +async function readArtifactRecords(artifactsDir: string): Promise<ArtifactRecord[]> { /* loads every file in artifactsDir whose name matches ARTIFACT_JSON_PATTERN */ + const files = await readdir(artifactsDir); + const records: ArtifactRecord[] = []; + + for (const file of files) { + const match = ARTIFACT_JSON_PATTERN.exec(file); + if (!match) 
{ + continue; + } + const backend = match[1] as Phase0BackendTarget; + const tag = match[2] ?? ''; + const path = resolve(artifactsDir, file); + const raw = await readFile(path, 'utf8'); + const document = JSON.parse(raw) as Phase0BaselineArtifactDocument; + const generatedAtIso = typeof document.generated_at === 'string' ? document.generated_at : undefined; + const generatedAtMs = generatedAtIso ? Date.parse(generatedAtIso) : NaN; + + records.push({ + backend, + tag, + path, + generatedAtIso, + generatedAtMs: Number.isFinite(generatedAtMs) ? generatedAtMs : undefined, + document, + }); + } + + return records; +} + +function pickCandidate(records: ArtifactRecord[], explicitTag?: string): ArtifactRecord { + if (explicitTag) { + const match = records.find((record) => record.tag === explicitTag); + if (!match) { + throw new Error(`No artifact found for candidate tag "${explicitTag}".`); + } + return match; + } + + const sorted = sortRecordsDesc(records); + const latest = sorted[0]; + if (!latest) { + throw new Error('No candidate artifact found.'); + } + return latest; +} + +function pickBaseline(records: ArtifactRecord[], candidate: ArtifactRecord, explicitBaselineTag?: string): ArtifactRecord | undefined { + if (explicitBaselineTag) { + const match = records.find((record) => record.tag === explicitBaselineTag); + if (!match) { + throw new Error(`No artifact found for baseline tag "${explicitBaselineTag}".`); + } + return match; + } + + // Default baseline is the record that follows the candidate in sortRecordsDesc order (the next-older artifact). + // Taking the first non-candidate record would pick a newer artifact whenever an older candidate tag is pinned. + const sorted = sortRecordsDesc(records); + const candidateIndex = sorted.findIndex((record) => record.path === candidate.path); + // A candidate absent from records has no position to anchor on, so no baseline is reported; + // stepping past the oldest record likewise yields undefined. + return candidateIndex === -1 ? undefined : sorted[candidateIndex + 1]; +} + +function renderMarkdown( + artifactsDir: string, + backends: Phase0BackendTarget[], + thresholds: Phase0BaselineDriftGateThresholds, + maxAgeHours: number | undefined, + results: BackendDriftResult[], + overallPass: boolean, +): string { + const lines: string[] = []; + lines.push('# Phase-0 Backend Drift Check'); + lines.push(''); + lines.push(`Generated at: ${new 
Date().toISOString()}`); + lines.push(`Artifacts: ${artifactsDir}`); + lines.push(`Backends: ${backends.join(', ')}`); + if (typeof maxAgeHours === 'number') { + lines.push(`Freshness max age (hours): ${maxAgeHours}`); + } else { + lines.push('Freshness max age (hours): disabled'); + } + lines.push(`Overall gate: ${overallPass ? 'PASS' : 'FAIL'}`); + lines.push(''); + + const thresholdEntries = Object.entries(thresholds).filter(([, value]) => value !== undefined); + lines.push('## Thresholds'); + if (thresholdEntries.length === 0) { + lines.push('- none (report-only mode)'); + } else { + for (const [key, value] of thresholdEntries) { + lines.push(`- ${key}: ${String(value)}`); + } + } + lines.push(''); + + for (const result of results) { + lines.push(`## ${result.backend}`); + lines.push(`- status: ${result.pass ? 'PASS' : 'FAIL'}`); + lines.push(`- candidate: tag=${result.candidate.tag} file=${result.candidate.path}`); + lines.push(`- candidate generated_at: ${result.candidate.generatedAtIso ?? 'n/a'}`); + if (result.baseline) { + lines.push(`- baseline: tag=${result.baseline.tag} file=${result.baseline.path}`); + lines.push(`- baseline generated_at: ${result.baseline.generatedAtIso ?? 
'n/a'}`); + } else { + lines.push('- baseline: none'); + } + lines.push(`- candidate snapshot: ${renderPhase0BaselineDriftSnapshot(result.comparison.candidate)}`); + if (result.comparison.baseline) { + lines.push(`- baseline snapshot: ${renderPhase0BaselineDriftSnapshot(result.comparison.baseline)}`); + } + lines.push('- deltas:'); + lines.push(` sampled_event_count_pct=${formatSignedNumber(result.comparison.deltas.sampled_event_count_pct, '%')}`); + lines.push(` run_total_outcomes_pct=${formatSignedNumber(result.comparison.deltas.run_total_outcomes_pct, '%')}`); + lines.push(` completion_rate_pp=${formatSignedNumber(result.comparison.deltas.completion_rate_pp)}`); + lines.push(` cancel_rate_pp=${formatSignedNumber(result.comparison.deltas.cancel_rate_pp)}`); + lines.push(` error_rate_pp=${formatSignedNumber(result.comparison.deltas.error_rate_pp)}`); + lines.push(` cancel_latency_p95_ms=${formatSignedNumber(result.comparison.deltas.cancel_latency_p95_ms)}`); + lines.push(` reaction_match_rate_pp=${formatSignedNumber(result.comparison.deltas.reaction_match_rate_pp)}`); + lines.push(` reaction_skip_rate_pp=${formatSignedNumber(result.comparison.deltas.reaction_skip_rate_pp)}`); + lines.push(`- freshness gate: ${result.freshness.pass ? 'PASS' : 'FAIL'} (age_hours=${formatFreshnessHours(result.freshness.actual_age_hours)} threshold=${result.freshness.threshold_hours ?? 'n/a'})`); + lines.push(`- drift gate: ${result.driftGate.pass ? 'PASS' : 'FAIL'}`); + if (result.driftGate.criteria.length === 0) { + lines.push(' criteria: none'); + } else { + for (const criterion of result.driftGate.criteria) { + lines.push(` ${criterion.pass ? 
'PASS' : 'FAIL'} ${criterion.criterion} actual=${criterion.actual} threshold=${criterion.threshold}`); + } + } + lines.push(''); + } + + return lines.join('\n'); +} + +async function main(): Promise<void> { /* CLI entry point: strict flag parsing first; --help prints usage and returns */ + const { values } = parseArgs({ + options: { + 'artifacts-dir': { type: 'string' }, + backend: { type: 'string' }, + tag: { type: 'string' }, + 'baseline-tag': { type: 'string' }, + 'max-age-hours': { type: 'string' }, + 'require-baseline-history': { type: 'boolean' }, + 'min-candidate-sampled-events': { type: 'string' }, + 'min-baseline-sampled-events': { type: 'string' }, + 'max-sampled-events-drop-pct': { type: 'string' }, + 'max-run-outcomes-drop-pct': { type: 'string' }, + 'max-completion-rate-drop-pp': { type: 'string' }, + 'max-cancel-rate-increase-pp': { type: 'string' }, + 'max-error-rate-increase-pp': { type: 'string' }, + 'max-cancel-latency-p95-increase-ms': { type: 'string' }, + format: { type: 'string' }, + out: { type: 'string' }, + help: { type: 'boolean', short: 'h' }, + }, + strict: true, + allowPositionals: false, + }); + + if (values.help) { + process.stdout.write(`${usage()}\n`); + return; + } + + const artifactsDir = resolve(values['artifacts-dir'] ?? 
'docs/plans/artifacts'); + const backends = parseBackends(values.backend); + const candidateTag = values.tag; + const baselineTag = values['baseline-tag']; + const format = parseFormat(values.format); + const maxAgeHours = parseOptionalNumber(values['max-age-hours'], '--max-age-hours'); + if (typeof maxAgeHours === 'number' && maxAgeHours < 0) { + throw new Error('--max-age-hours must be >= 0.'); + } + + const thresholds = buildThresholds(values as Record<string, unknown>); + const allRecords = await readArtifactRecords(artifactsDir); + const nowMs = Date.now(); + const results: BackendDriftResult[] = []; + + for (const backend of backends) { + const backendRecords = allRecords.filter((record) => record.backend === backend); + if (backendRecords.length === 0) { + throw new Error(`No backend artifact JSON files found for "${backend}" in ${artifactsDir}.`); + } + + const candidate = pickCandidate(backendRecords, candidateTag); + const baseline = pickBaseline(backendRecords, candidate, baselineTag); + const comparison = comparePhase0BaselineDrift(candidate.document, baseline?.document); + const driftGate = evaluatePhase0BaselineDriftGate(comparison, thresholds); + + const freshness = (() => { + if (typeof maxAgeHours !== 'number') { + return { + enabled: false, + pass: true, + actual_age_hours: null, + threshold_hours: null, + }; + } + if (typeof candidate.generatedAtMs !== 'number') { /* no parseable generated_at: the freshness gate fails */ + return { + enabled: true, + pass: false, + actual_age_hours: null, + threshold_hours: maxAgeHours, + }; + } + const ageHours = Math.max(0, (nowMs - candidate.generatedAtMs) / (1000 * 60 * 60)); + return { + enabled: true, + pass: ageHours <= maxAgeHours, + actual_age_hours: Math.round(ageHours * 100) / 100, + threshold_hours: maxAgeHours, + }; + })(); + + results.push({ + backend, + candidate, + baseline, + comparison, + freshness, + driftGate, + pass: freshness.pass && driftGate.pass, + }); + } + + const overallPass = results.every((result) => result.pass); + const output = format === 'json' + ? 
JSON.stringify({ + generated_at: new Date().toISOString(), + artifacts_dir: artifactsDir, + backends, + candidate_tag: candidateTag, + baseline_tag: baselineTag, + max_age_hours: maxAgeHours, + thresholds, + overall_pass: overallPass, + results: results.map((result) => ({ + backend: result.backend, + pass: result.pass, + candidate: { + tag: result.candidate.tag, + path: result.candidate.path, + generated_at: result.candidate.generatedAtIso, + }, + baseline: result.baseline + ? { + tag: result.baseline.tag, + path: result.baseline.path, + generated_at: result.baseline.generatedAtIso, + } + : null, + comparison: result.comparison, + freshness: result.freshness, + drift_gate: result.driftGate, + })), + }, null, 2) + : renderMarkdown(artifactsDir, backends, thresholds, maxAgeHours, results, overallPass); + + if (values.out) { + await writeOutput(resolve(values.out), output); + } else { + process.stdout.write(`${output}\n`); + } + + if (!overallPass) { + process.exitCode = 1; + } +} + +main().catch((error) => { + const message = error instanceof Error ? 
import { describe, expect, it } from 'vitest';
import {
  comparePhase0BaselineDrift,
  evaluatePhase0BaselineDriftGate,
  extractPhase0BaselineDriftSnapshot,
} from './phase0BaselineDrift.js';

/** Cancel-latency aggregate as it appears under `summary.cancel_latency_ms`. */
interface LatencyFixture {
  count: number;
  avg_ms: number;
  p50_ms: number;
  p95_ms: number;
  min_ms: number;
  max_ms: number;
}

/** Inputs for {@link overallOf}; `rates` is `[completion, cancel, error]` in percent. */
interface OverallInput {
  total: number;
  complete: number;
  cancelled: number;
  error: number;
  rates: [completion: number, cancel: number, error: number];
}

/**
 * Builds `summary.run_outcomes.overall`.
 * Every fixture in this suite has `start === total_outcomes` and no cancel requests.
 */
function overallOf({ total, complete, cancelled, error, rates }: OverallInput) {
  const [completionPct, cancelPct, errorPct] = rates;
  return {
    total_outcomes: total,
    complete,
    cancelled,
    error,
    cancel_requested: 0,
    start: total,
    completion_rate_pct: completionPct,
    cancel_rate_pct: cancelPct,
    error_rate_pct: errorPct,
  };
}

/** Latency aggregate for a single observation: every statistic equals `ms`. */
function singleSampleLatency(ms: number): LatencyFixture {
  return { count: 1, avg_ms: ms, p50_ms: ms, p95_ms: ms, min_ms: ms, max_ms: ms };
}

/** Builds `summary.reactions`; `total` is always `matched + skipped` in these fixtures. */
function reactionsOf(
  matched: number,
  skipped: number,
  matchPct: number | null,
  skipPct: number | null,
) {
  return {
    matched,
    skipped,
    total: matched + skipped,
    match_rate_pct: matchPct,
    skip_rate_pct: skipPct,
    skip_reasons: [],
  };
}

/** Reactions block for a window without any reaction events. */
function noReactions() {
  return reactionsOf(0, 0, null, null);
}

/** Inputs for {@link buildArtifact}. */
interface ArtifactInput {
  sourceEvents?: number;
  sampledEvents: number;
  overall: ReturnType<typeof overallOf>;
  cancelLatency: LatencyFixture | null;
  reactions: ReturnType<typeof reactionsOf>;
}

/**
 * Assembles an artifact payload around the parts that vary between fixtures.
 * `source_event_count` is only present when `sourceEvents` is given.
 */
function buildArtifact(input: ArtifactInput) {
  return {
    ...(input.sourceEvents === undefined ? {} : { source_event_count: input.sourceEvents }),
    sampled_event_count: input.sampledEvents,
    summary: {
      event_counts: {
        run_state: 0,
        run_cancel: 0,
        reaction_match: 0,
        reaction_skip: 0,
      },
      run_outcomes: {
        overall: input.overall,
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: input.cancelLatency,
      reactions: input.reactions,
    },
  };
}

describe('phase0BaselineDrift', () => {
  it('extracts a normalized snapshot from artifact payloads', () => {
    const artifact = buildArtifact({
      sourceEvents: 120,
      sampledEvents: 60,
      overall: overallOf({ total: 25, complete: 20, cancelled: 3, error: 2, rates: [80, 12, 8] }),
      cancelLatency: {
        count: 2,
        avg_ms: 120,
        p50_ms: 100,
        p95_ms: 180,
        min_ms: 80,
        max_ms: 220,
      },
      reactions: reactionsOf(10, 5, 66.67, 33.33),
    });

    expect(extractPhase0BaselineDriftSnapshot(artifact)).toEqual({
      source_event_count: 120,
      sampled_event_count: 60,
      run_total_outcomes: 25,
      completion_rate_pct: 80,
      cancel_rate_pct: 12,
      error_rate_pct: 8,
      cancel_latency_p95_ms: 180,
      reaction_match_rate_pct: 66.67,
      reaction_skip_rate_pct: 33.33,
    });
  });

  it('computes deltas between baseline and candidate artifacts', () => {
    const candidate = buildArtifact({
      sampledEvents: 45,
      overall: overallOf({
        total: 18,
        complete: 15,
        cancelled: 2,
        error: 1,
        rates: [83.33, 11.11, 5.56],
      }),
      cancelLatency: singleSampleLatency(50),
      reactions: reactionsOf(3, 1, 75, 25),
    });
    const baseline = buildArtifact({
      sampledEvents: 60,
      overall: overallOf({ total: 20, complete: 18, cancelled: 1, error: 1, rates: [90, 5, 5] }),
      cancelLatency: singleSampleLatency(80),
      reactions: reactionsOf(7, 3, 70, 30),
    });

    const { deltas } = comparePhase0BaselineDrift(candidate, baseline);

    expect(deltas).toEqual({
      sampled_event_count_pct: -25,
      run_total_outcomes_pct: -10,
      completion_rate_pp: -6.67,
      cancel_rate_pp: 6.11,
      error_rate_pp: 0.56,
      cancel_latency_p95_ms: -30,
      reaction_match_rate_pp: 5,
      reaction_skip_rate_pp: -5,
    });
  });

  it('evaluates drift thresholds against candidate deltas', () => {
    const candidate = buildArtifact({
      sampledEvents: 80,
      overall: overallOf({ total: 30, complete: 24, cancelled: 3, error: 3, rates: [80, 10, 10] }),
      cancelLatency: singleSampleLatency(200),
      reactions: noReactions(),
    });
    const baseline = buildArtifact({
      sampledEvents: 100,
      overall: overallOf({
        total: 40,
        complete: 38,
        cancelled: 1,
        error: 1,
        rates: [95, 2.5, 2.5],
      }),
      cancelLatency: singleSampleLatency(120),
      reactions: noReactions(),
    });
    const comparison = comparePhase0BaselineDrift(candidate, baseline);

    // Generous limits: every configured criterion is satisfied.
    const passResult = evaluatePhase0BaselineDriftGate(comparison, {
      minCandidateSampledEvents: 50,
      minBaselineSampledEvents: 50,
      maxSampledEventsDropPct: 30,
      maxRunOutcomesDropPct: 30,
      maxCompletionRateDropPp: 20,
      maxCancelRateIncreasePp: 12,
      maxErrorRateIncreasePp: 12,
      maxCancelLatencyP95IncreaseMs: 100,
    });
    expect(passResult.pass).toBe(true);
    expect(passResult.criteria.every((row) => row.pass)).toBe(true);

    // Tight limits: all four configured criteria are violated, in evaluation order.
    const failResult = evaluatePhase0BaselineDriftGate(comparison, {
      maxCompletionRateDropPp: 10,
      maxCancelRateIncreasePp: 5,
      maxErrorRateIncreasePp: 5,
      maxCancelLatencyP95IncreaseMs: 50,
    });
    expect(failResult.pass).toBe(false);
    const failedCriteria = failResult.criteria.flatMap((row) => (row.pass ? [] : [row.criterion]));
    expect(failedCriteria).toEqual([
      'completion_rate_drop_pp',
      'cancel_rate_increase_pp',
      'error_rate_increase_pp',
      'cancel_latency_p95_increase_ms',
    ]);
  });

  it('supports missing baseline history and optional strict requirement', () => {
    const candidateOnly = buildArtifact({
      sampledEvents: 12,
      overall: overallOf({ total: 3, complete: 3, cancelled: 0, error: 0, rates: [100, 0, 0] }),
      cancelLatency: null,
      reactions: noReactions(),
    });
    const noBaseline = comparePhase0BaselineDrift(candidateOnly);

    // Without `requireBaselineHistory`, an unavailable delta is reported as n/a and passes.
    const relaxed = evaluatePhase0BaselineDriftGate(noBaseline, {
      maxSampledEventsDropPct: 25,
    });
    expect(relaxed.pass).toBe(true);
    expect(relaxed.criteria[0]).toMatchObject({
      criterion: 'sampled_events_drop_pct',
      pass: true,
      actual: 'n/a',
    });

    // With `requireBaselineHistory`, both the history check and the n/a delta fail.
    const strict = evaluatePhase0BaselineDriftGate(noBaseline, {
      requireBaselineHistory: true,
      maxSampledEventsDropPct: 25,
    });
    expect(strict.pass).toBe(false);
    const strictFailures = strict.criteria.flatMap((row) => (row.pass ? [] : [row.criterion]));
    expect(strictFailures).toEqual(['baseline_history', 'sampled_events_drop_pct']);
  });
});
import type { Phase0BaselineSummary } from './phase0BaselineSummary.js';

/**
 * The parts of a phase-0 baseline artifact that the drift checks read.
 * Every field is optional; missing or non-finite values are normalized by
 * {@link extractPhase0BaselineDriftSnapshot}.
 */
export interface Phase0BaselineArtifactDocument {
  generated_at?: string;
  source_event_count?: number;
  sampled_event_count?: number;
  summary?: Phase0BaselineSummary;
}

/**
 * Flat, normalized view of one artifact.
 * Counts fall back to `0`; rates and latency fall back to `null` when the
 * artifact does not carry a finite number for them.
 */
export interface Phase0BaselineDriftSnapshot {
  source_event_count: number;
  sampled_event_count: number;
  run_total_outcomes: number;
  completion_rate_pct: number | null;
  cancel_rate_pct: number | null;
  error_rate_pct: number | null;
  cancel_latency_p95_ms: number | null;
  reaction_match_rate_pct: number | null;
  reaction_skip_rate_pct: number | null;
}

/**
 * Candidate-versus-baseline differences, rounded to two decimals.
 * `*_pct` fields are relative changes in percent, `*_pp` fields and
 * `cancel_latency_p95_ms` are `candidate - baseline`. A field is `null` when
 * it cannot be computed (no baseline, a non-positive baseline count, or a
 * metric that is `null` on either side).
 */
export interface Phase0BaselineDriftDeltas {
  sampled_event_count_pct: number | null;
  run_total_outcomes_pct: number | null;
  completion_rate_pp: number | null;
  cancel_rate_pp: number | null;
  error_rate_pp: number | null;
  cancel_latency_p95_ms: number | null;
  reaction_match_rate_pp: number | null;
  reaction_skip_rate_pp: number | null;
}

/** Result of {@link comparePhase0BaselineDrift}; `baseline` is `null` when none was supplied. */
export interface Phase0BaselineDriftComparison {
  baseline: Phase0BaselineDriftSnapshot | null;
  candidate: Phase0BaselineDriftSnapshot;
  deltas: Phase0BaselineDriftDeltas;
}

/**
 * Gate configuration. Each numeric threshold is optional; a criterion is only
 * evaluated when its threshold is a finite number.
 */
export interface Phase0BaselineDriftGateThresholds {
  requireBaselineHistory?: boolean;
  minCandidateSampledEvents?: number;
  minBaselineSampledEvents?: number;
  maxSampledEventsDropPct?: number;
  maxRunOutcomesDropPct?: number;
  maxCompletionRateDropPp?: number;
  maxCancelRateIncreasePp?: number;
  maxErrorRateIncreasePp?: number;
  maxCancelLatencyP95IncreaseMs?: number;
}

/** One evaluated gate row; `actual` and `threshold` are pre-formatted for display. */
export interface Phase0BaselineDriftGateCriterion {
  criterion: string;
  pass: boolean;
  actual: string;
  threshold: string;
}

/** Overall gate verdict: `pass` is true only when every row in `criteria` passed. */
export interface Phase0BaselineDriftGateResult {
  pass: boolean;
  criteria: Phase0BaselineDriftGateCriterion[];
}

/** Returns `value` when it is a finite number, otherwise `undefined`. */
function readFiniteNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}

/** Same as {@link readFiniteNumber}, but reports "absent" as `null`. */
function readFiniteNumberOrNull(value: unknown): number | null {
  return readFiniteNumber(value) ?? null;
}

/** Reads a gate threshold; anything that is not a finite number disables the criterion. */
function readThreshold(value: unknown): number | undefined {
  return readFiniteNumber(value);
}

/**
 * Rounds to two decimals.
 * `Math.round` returns `-0` for inputs in `[-0.5, 0)`; that is normalized to
 * `0` so tiny negative differences do not surface as negative zero, which
 * `Object.is`-based equality treats as different from `0`.
 */
function roundToHundredths(value: number): number {
  const rounded = Math.round(value * 100) / 100;
  return rounded === 0 ? 0 : rounded;
}

/**
 * Relative change from `baseline` to `candidate` in percent, or `null` when
 * the baseline is not a positive finite number or the candidate is not finite.
 */
function toPctDelta(baseline: number, candidate: number): number | null {
  const usable = Number.isFinite(baseline) && baseline > 0 && Number.isFinite(candidate);
  if (!usable) {
    return null;
  }
  return roundToHundredths(((candidate - baseline) / baseline) * 100);
}

/** `candidate - baseline`, or `null` when either side is unavailable. */
function toAbsoluteDelta(baseline: number | null, candidate: number | null): number | null {
  if (baseline === null || candidate === null) {
    return null;
  }
  return roundToHundredths(candidate - baseline);
}

/** Formats a nullable number for display, using `n/a` for `null` or non-finite values. */
function formatNumber(value: number | null, suffix = ''): string {
  if (value === null || !Number.isFinite(value)) {
    return 'n/a';
  }
  return `${value}${suffix}`;
}

/**
 * Normalizes an artifact into a flat snapshot.
 *
 * @param artifact - Artifact payload; any part of it may be missing.
 * @returns Snapshot where missing/non-finite counts are `0` and missing/non-finite
 *   rates and latency are `null`.
 */
export function extractPhase0BaselineDriftSnapshot(
  artifact: Phase0BaselineArtifactDocument,
): Phase0BaselineDriftSnapshot {
  const summary = artifact.summary;
  const runOverall = summary?.run_outcomes?.overall;
  const reactions = summary?.reactions;
  const cancelLatency = summary?.cancel_latency_ms;

  return {
    source_event_count: readFiniteNumber(artifact.source_event_count) ?? 0,
    sampled_event_count: readFiniteNumber(artifact.sampled_event_count) ?? 0,
    run_total_outcomes: readFiniteNumber(runOverall?.total_outcomes) ?? 0,
    completion_rate_pct: readFiniteNumberOrNull(runOverall?.completion_rate_pct),
    cancel_rate_pct: readFiniteNumberOrNull(runOverall?.cancel_rate_pct),
    error_rate_pct: readFiniteNumberOrNull(runOverall?.error_rate_pct),
    cancel_latency_p95_ms: readFiniteNumberOrNull(cancelLatency?.p95_ms),
    reaction_match_rate_pct: readFiniteNumberOrNull(reactions?.match_rate_pct),
    reaction_skip_rate_pct: readFiniteNumberOrNull(reactions?.skip_rate_pct),
  };
}

/**
 * Compares a candidate artifact with an optional baseline artifact.
 *
 * @param candidateArtifact - The artifact being evaluated.
 * @param baselineArtifact - The reference artifact; when omitted or `null`,
 *   `baseline` is `null` and every delta is `null`.
 * @returns Both snapshots plus `candidate - baseline` deltas rounded to two decimals.
 */
export function comparePhase0BaselineDrift(
  candidateArtifact: Phase0BaselineArtifactDocument,
  baselineArtifact?: Phase0BaselineArtifactDocument | null,
): Phase0BaselineDriftComparison {
  const candidate = extractPhase0BaselineDriftSnapshot(candidateArtifact);
  const baseline = baselineArtifact ? extractPhase0BaselineDriftSnapshot(baselineArtifact) : null;

  return {
    baseline,
    candidate,
    deltas: {
      sampled_event_count_pct: baseline
        ? toPctDelta(baseline.sampled_event_count, candidate.sampled_event_count)
        : null,
      run_total_outcomes_pct: baseline
        ? toPctDelta(baseline.run_total_outcomes, candidate.run_total_outcomes)
        : null,
      completion_rate_pp: toAbsoluteDelta(
        baseline?.completion_rate_pct ?? null,
        candidate.completion_rate_pct,
      ),
      cancel_rate_pp: toAbsoluteDelta(baseline?.cancel_rate_pct ?? null, candidate.cancel_rate_pct),
      error_rate_pp: toAbsoluteDelta(baseline?.error_rate_pct ?? null, candidate.error_rate_pct),
      cancel_latency_p95_ms: toAbsoluteDelta(
        baseline?.cancel_latency_p95_ms ?? null,
        candidate.cancel_latency_p95_ms,
      ),
      reaction_match_rate_pp: toAbsoluteDelta(
        baseline?.reaction_match_rate_pct ?? null,
        candidate.reaction_match_rate_pct,
      ),
      reaction_skip_rate_pp: toAbsoluteDelta(
        baseline?.reaction_skip_rate_pct ?? null,
        candidate.reaction_skip_rate_pct,
      ),
    },
  };
}

/** A "must not regress by more than `limit`" check on a single delta. */
interface DirectionalCheck {
  criterion: string;
  delta: number | null;
  limit: number | undefined;
  /** Which sign of `delta` counts as a regression. */
  regression: 'drop' | 'increase';
}

/**
 * Evaluates one directional check.
 *
 * @returns `undefined` when the limit is not configured; otherwise the gate row.
 *   An unavailable delta is reported as `n/a` and passes only when
 *   `passWhenUnavailable` is true. Movement in the non-regressing direction
 *   counts as `0`.
 */
function evaluateDirectionalCheck(
  check: DirectionalCheck,
  passWhenUnavailable: boolean,
): Phase0BaselineDriftGateCriterion | undefined {
  const limit = readThreshold(check.limit);
  if (limit === undefined) {
    return undefined;
  }

  const threshold = `<= ${limit}`;
  if (check.delta === null) {
    return { criterion: check.criterion, pass: passWhenUnavailable, actual: 'n/a', threshold };
  }

  const regression = Math.max(0, check.regression === 'drop' ? -check.delta : check.delta);
  return {
    criterion: check.criterion,
    pass: regression <= limit,
    actual: `${Math.round(regression * 100) / 100}`,
    threshold,
  };
}

/**
 * Evaluates a comparison against the configured thresholds.
 *
 * Rows are emitted in a fixed order: `baseline_history` (only when
 * `requireBaselineHistory` is truthy), `candidate_sampled_events`,
 * `baseline_sampled_events`, then the drop/increase criteria. Criteria whose
 * threshold is not a finite number are omitted.
 *
 * When a value is unavailable (no baseline snapshot, or a `null` delta) the
 * row's `actual` is `n/a` and it passes only if `requireBaselineHistory` is
 * falsy. Note that a delta is also `null` when a baseline exists but the
 * metric is missing on either side.
 *
 * @param comparison - Output of {@link comparePhase0BaselineDrift}.
 * @param thresholds - Gate configuration.
 * @returns The evaluated rows and whether all of them passed (vacuously true
 *   when no criterion is configured).
 */
export function evaluatePhase0BaselineDriftGate(
  comparison: Phase0BaselineDriftComparison,
  thresholds: Phase0BaselineDriftGateThresholds,
): Phase0BaselineDriftGateResult {
  const criteria: Phase0BaselineDriftGateCriterion[] = [];
  const requireBaselineHistory = Boolean(thresholds.requireBaselineHistory);
  const { baseline, candidate, deltas } = comparison;

  if (requireBaselineHistory) {
    criteria.push({
      criterion: 'baseline_history',
      pass: baseline !== null,
      actual: baseline ? 'present' : 'missing',
      threshold: 'required',
    });
  }

  const minCandidateSampledEvents = readThreshold(thresholds.minCandidateSampledEvents);
  if (minCandidateSampledEvents !== undefined) {
    criteria.push({
      criterion: 'candidate_sampled_events',
      pass: candidate.sampled_event_count >= minCandidateSampledEvents,
      actual: String(candidate.sampled_event_count),
      threshold: `>= ${minCandidateSampledEvents}`,
    });
  }

  const minBaselineSampledEvents = readThreshold(thresholds.minBaselineSampledEvents);
  if (minBaselineSampledEvents !== undefined) {
    criteria.push({
      criterion: 'baseline_sampled_events',
      pass: baseline
        ? baseline.sampled_event_count >= minBaselineSampledEvents
        : !requireBaselineHistory,
      actual: baseline ? String(baseline.sampled_event_count) : 'n/a',
      threshold: `>= ${minBaselineSampledEvents}`,
    });
  }

  // Order matters: callers and tests observe criteria in this sequence.
  const directionalChecks: DirectionalCheck[] = [
    {
      criterion: 'sampled_events_drop_pct',
      delta: deltas.sampled_event_count_pct,
      limit: thresholds.maxSampledEventsDropPct,
      regression: 'drop',
    },
    {
      criterion: 'run_outcomes_drop_pct',
      delta: deltas.run_total_outcomes_pct,
      limit: thresholds.maxRunOutcomesDropPct,
      regression: 'drop',
    },
    {
      criterion: 'completion_rate_drop_pp',
      delta: deltas.completion_rate_pp,
      limit: thresholds.maxCompletionRateDropPp,
      regression: 'drop',
    },
    {
      criterion: 'cancel_rate_increase_pp',
      delta: deltas.cancel_rate_pp,
      limit: thresholds.maxCancelRateIncreasePp,
      regression: 'increase',
    },
    {
      criterion: 'error_rate_increase_pp',
      delta: deltas.error_rate_pp,
      limit: thresholds.maxErrorRateIncreasePp,
      regression: 'increase',
    },
    {
      criterion: 'cancel_latency_p95_increase_ms',
      delta: deltas.cancel_latency_p95_ms,
      limit: thresholds.maxCancelLatencyP95IncreaseMs,
      regression: 'increase',
    },
  ];

  for (const check of directionalChecks) {
    const row = evaluateDirectionalCheck(check, !requireBaselineHistory);
    if (row !== undefined) {
      criteria.push(row);
    }
  }

  return {
    pass: criteria.every((row) => row.pass),
    criteria,
  };
}

/**
 * Renders a snapshot as a single space-separated `key=value` line.
 * Unavailable rates and latency are printed as `n/a`; reaction rates and the
 * source event count are not included.
 */
export function renderPhase0BaselineDriftSnapshot(snapshot: Phase0BaselineDriftSnapshot): string {
  const parts = [
    `sampled=${snapshot.sampled_event_count}`,
    `outcomes=${snapshot.run_total_outcomes}`,
    `completion=${formatNumber(snapshot.completion_rate_pct, '%')}`,
    `cancel=${formatNumber(snapshot.cancel_rate_pct, '%')}`,
    `error=${formatNumber(snapshot.error_rate_pct, '%')}`,
    `cancel_p95_ms=${formatNumber(snapshot.cancel_latency_p95_ms)}`,
  ];
  return parts.join(' ');
}