From e905fe1d56883b356d6279701ab4aab0d3448982 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Fri, 27 Feb 2026 09:05:25 -0800 Subject: [PATCH] feat(audit): persist phase0 backend drift report artifacts --- README.md | 3 + docs/api/PROTOCOL.md | 2 +- docs/architecture/AGENT_DIAGRAM.md | 2 +- .../GATEWAY_SESSIONS_AND_QUEUE.md | 2 +- ...phase0-instrumentation-ticket-checklist.md | 2 +- ...aseline_live_backend_drift_2026-02-27.json | 201 ++++++++++++++++++ ..._baseline_live_backend_drift_2026-02-27.md | 68 ++++++ docs/plans/state.json | 23 +- package.json | 2 +- .../check-phase0-baseline-backend-drift.ts | 100 ++++++--- 10 files changed, 367 insertions(+), 38 deletions(-) create mode 100644 docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.json create mode 100644 docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.md diff --git a/README.md b/README.md index 8afeb0e..4418ddd 100644 --- a/README.md +++ b/README.md @@ -1644,6 +1644,9 @@ Backend drift/freshness gate for backend-scoped artifacts (`pi_embedded` vs `nat ```bash pnpm audit:phase0-baseline:live:drift ``` +This command writes drift reports to: +- `docs/plans/artifacts/phase0_baseline_live_backend_drift_<tag>.md` +- `docs/plans/artifacts/phase0_baseline_live_backend_drift_<tag>.json` Cadence scheduling (example: every 6 hours via host cron) with drift check: ```bash diff --git a/docs/api/PROTOCOL.md b/docs/api/PROTOCOL.md index f59321e..2764e87 100644 --- a/docs/api/PROTOCOL.md +++ b/docs/api/PROTOCOL.md @@ -23,7 +23,7 @@ The gateway provides: - **HTTP Server**: Serves static dashboard and handles webhook endpoints - **Node Capability Negotiation**: Optional companion-node role/capability registration -Operational note: onboarding (`flynn setup` / `flynn onboard`) now runs post-save live readiness checks (model/channel/memory/automation) and prints a guided first-success task flow. 
Companion CLI now also supports bootstrap-manifest export (`flynn companion --export-bootstrap `), release-bundle export (`--export-release-bundle ` with optional `--signing-key`/`--signing-key-id` signature output), release-bundle verification (`--verify-release-bundle ` with optional `--verify-signing-key`/`--verify-signing-key-id`/`--require-signature`), platform shell-template export (`--export-shell-template `), plus richer shell bootstrap flags for status/location/push (`--app-version`, `--latitude/--longitude`, `--push-token`, etc.) for desktop/mobile app packaging without changing JSON-RPC method/event shapes. Audit observability now includes live phase-0 baseline capture flows: `pnpm audit:phase0-baseline:live` for channel-origin windows, backend-scoped variants (`pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`) via `--backend`, `pnpm audit:phase0-baseline:live:gateway` (auto-detected cancel window) for gateway-origin windows, `pnpm audit:phase0-baseline:live:refresh` for one-shot refresh of both windows, and `pnpm audit:phase0-baseline:live:drift` for backend artifact freshness/drift gates. These scripts default to current UTC-date tags unless `--tag` is explicitly provided. +Operational note: onboarding (`flynn setup` / `flynn onboard`) now runs post-save live readiness checks (model/channel/memory/automation) and prints a guided first-success task flow. Companion CLI now also supports bootstrap-manifest export (`flynn companion --export-bootstrap `), release-bundle export (`--export-release-bundle ` with optional `--signing-key`/`--signing-key-id` signature output), release-bundle verification (`--verify-release-bundle ` with optional `--verify-signing-key`/`--verify-signing-key-id`/`--require-signature`), platform shell-template export (`--export-shell-template `), plus richer shell bootstrap flags for status/location/push (`--app-version`, `--latitude/--longitude`, `--push-token`, etc.) 
for desktop/mobile app packaging without changing JSON-RPC method/event shapes. Audit observability now includes live phase-0 baseline capture flows: `pnpm audit:phase0-baseline:live` for channel-origin windows, backend-scoped variants (`pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`) via `--backend`, `pnpm audit:phase0-baseline:live:gateway` (auto-detected cancel window) for gateway-origin windows, `pnpm audit:phase0-baseline:live:refresh` for one-shot refresh of both windows, and `pnpm audit:phase0-baseline:live:drift` for backend artifact freshness/drift gates (writing `phase0_baseline_live_backend_drift_.md/.json` reports). These scripts default to current UTC-date tags unless `--tag` is explicitly provided. ### Execution Model (Sessions + Per-Session Queue) diff --git a/docs/architecture/AGENT_DIAGRAM.md b/docs/architecture/AGENT_DIAGRAM.md index e660bce..c2599f0 100644 --- a/docs/architecture/AGENT_DIAGRAM.md +++ b/docs/architecture/AGENT_DIAGRAM.md @@ -170,7 +170,7 @@ Gateway streaming UX signals: - `pnpm audit:phase0-baseline:live:pi` and `pnpm audit:phase0-baseline:live:native` capture backend-scoped channel windows using `backend.route` timelines. - `pnpm audit:phase0-baseline:live:gateway` captures gateway-origin baseline windows by auto-selecting the latest cancel/cancelled session window (or use `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit windows). - `pnpm audit:phase0-baseline:live:refresh` runs both channel + gateway capture commands in one step for cadence refreshes. -- `pnpm audit:phase0-baseline:live:drift` evaluates backend-scoped artifact freshness/drift gates, and `pnpm audit:phase0-baseline:live:refresh:drift` runs capture + drift checks in one cadence step. 
+- `pnpm audit:phase0-baseline:live:drift` evaluates backend-scoped artifact freshness/drift gates and writes `docs/plans/artifacts/phase0_baseline_live_backend_drift_<tag>.md/.json`; `pnpm audit:phase0-baseline:live:refresh:drift` runs capture + drift checks in one cadence step. - `audit:phase0-baseline:live*` scripts are cadence-safe by default (UTC-date tags auto-generated unless explicitly overridden). - Canvas artifacts are persisted by the gateway so session UI surfaces can recover after daemon restarts. - TTS synthesis uses an ordered provider chain with health cooldown tracking; if all providers fail, replies degrade to text-only without dropping the response. diff --git a/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md b/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md index d0da820..2c8b3e2 100644 --- a/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md +++ b/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md @@ -35,7 +35,7 @@ If you only want the protocol surface, see `docs/api/PROTOCOL.md`. - Backend-scoped channel snapshots can be regenerated with `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native` (`--backend` filtering via `backend.route` timelines). - Gateway-origin phase-0 windows (including cancel-path samples) can be captured with `pnpm audit:phase0-baseline:live:gateway` (auto-detect latest cancel window) or `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit bounds. - `pnpm audit:phase0-baseline:live:refresh` runs both capture paths to refresh channel + gateway artifacts in one command. -- `pnpm audit:phase0-baseline:live:drift` checks backend-scoped artifact freshness/drift gates; `pnpm audit:phase0-baseline:live:refresh:drift` chains refresh + drift checks for scheduled cadence runs. 
+- `pnpm audit:phase0-baseline:live:drift` checks backend-scoped artifact freshness/drift gates and writes `phase0_baseline_live_backend_drift_<tag>.md/.json`; `pnpm audit:phase0-baseline:live:refresh:drift` chains refresh + drift checks for scheduled cadence runs. - `audit:phase0-baseline:live*` package scripts now omit fixed tags so scheduled runs automatically roll to current UTC-date artifact tags. - Companion CLI supports one-shot shell bootstrap metadata for live sessions (`--app-version`/`--status-text`, `--latitude`/`--longitude`, `--push-token`) so desktop/mobile wrappers can initialize node status/location/push in a single launch flow. - Canvas artifacts are persisted per session under the gateway data directory for UI recovery across restarts. diff --git a/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md b/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md index 3e37939..edc8c61 100644 --- a/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md +++ b/docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md @@ -203,7 +203,7 @@ Phase 0 is complete when: 2. A baseline summary artifact is generated and committed under `docs/plans/artifacts/`. 3. No user-visible response behavior changed compared to pre-phase baseline. -Follow-up status (2026-02-27): live channel-session artifacts exist under `docs/plans/artifacts/phase0_baseline_live_2026-02-27.*` via `pnpm audit:phase0-baseline:live` (anonymized IDs), and a second gateway-origin live window (including `run.cancel` + `cancel_requested`/`cancelled`) exists under `docs/plans/artifacts/phase0_baseline_live_gateway_2026-02-27.*`. 
Gateway window refreshes can now run via `pnpm audit:phase0-baseline:live:gateway` (auto-selected cancel window), both windows can be refreshed together with `pnpm audit:phase0-baseline:live:refresh` (scheduling example included in README), backend-scoped channel windows are now available via `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`, and backend artifact freshness/drift checks are now available via `pnpm audit:phase0-baseline:live:drift` (or chained with `pnpm audit:phase0-baseline:live:refresh:drift`). +Follow-up status (2026-02-27): live channel-session artifacts exist under `docs/plans/artifacts/phase0_baseline_live_2026-02-27.*` via `pnpm audit:phase0-baseline:live` (anonymized IDs), and a second gateway-origin live window (including `run.cancel` + `cancel_requested`/`cancelled`) exists under `docs/plans/artifacts/phase0_baseline_live_gateway_2026-02-27.*`. Gateway window refreshes can now run via `pnpm audit:phase0-baseline:live:gateway` (auto-selected cancel window), both windows can be refreshed together with `pnpm audit:phase0-baseline:live:refresh` (scheduling example included in README), backend-scoped channel windows are now available via `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`, and backend artifact freshness/drift checks are now available via `pnpm audit:phase0-baseline:live:drift` (or chained with `pnpm audit:phase0-baseline:live:refresh:drift`) with drift report artifacts written to `docs/plans/artifacts/phase0_baseline_live_backend_drift_<tag>.{md,json}`. 
## Subagent Model Assignment Plan diff --git a/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.json b/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.json new file mode 100644 index 0000000..66177aa --- /dev/null +++ b/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.json @@ -0,0 +1,201 @@ +{ + "generated_at": "2026-02-27T17:04:49.009Z", + "artifacts_dir": "/home/will/lab/flynn/docs/plans/artifacts", + "backends": [ + "pi_embedded", + "native" + ], + "report_tag": "2026-02-27", + "max_age_hours": 36, + "thresholds": { + "requireBaselineHistory": false, + "minCandidateSampledEvents": 10, + "maxSampledEventsDropPct": 80, + "maxRunOutcomesDropPct": 80, + "maxCompletionRateDropPp": 35, + "maxCancelRateIncreasePp": 25, + "maxErrorRateIncreasePp": 25, + "maxCancelLatencyP95IncreaseMs": 6000 + }, + "overall_pass": true, + "reports": { + "summary_json_out": "/home/will/lab/flynn/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.json", + "summary_md_out": "/home/will/lab/flynn/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.md" + }, + "results": [ + { + "backend": "pi_embedded", + "pass": true, + "candidate": { + "tag": "2026-02-27", + "path": "/home/will/lab/flynn/docs/plans/artifacts/phase0_baseline_live_backend_pi_embedded_2026-02-27.json", + "generated_at": "2026-02-27T16:45:18.488Z" + }, + "baseline": null, + "comparison": { + "baseline": null, + "candidate": { + "source_event_count": 110, + "sampled_event_count": 56, + "run_total_outcomes": 25, + "completion_rate_pct": 100, + "cancel_rate_pct": 0, + "error_rate_pct": 0, + "cancel_latency_p95_ms": null, + "reaction_match_rate_pct": 0, + "reaction_skip_rate_pct": 100 + }, + "deltas": { + "sampled_event_count_pct": null, + "run_total_outcomes_pct": null, + "completion_rate_pp": null, + "cancel_rate_pp": null, + "error_rate_pp": null, + "cancel_latency_p95_ms": null, + "reaction_match_rate_pp": null, + "reaction_skip_rate_pp": null 
+ } + }, + "freshness": { + "enabled": true, + "pass": true, + "actual_age_hours": 0.33, + "threshold_hours": 36 + }, + "drift_gate": { + "pass": true, + "criteria": [ + { + "criterion": "candidate_sampled_events", + "pass": true, + "actual": "56", + "threshold": ">= 10" + }, + { + "criterion": "sampled_events_drop_pct", + "pass": true, + "actual": "n/a", + "threshold": "<= 80" + }, + { + "criterion": "run_outcomes_drop_pct", + "pass": true, + "actual": "n/a", + "threshold": "<= 80" + }, + { + "criterion": "completion_rate_drop_pp", + "pass": true, + "actual": "n/a", + "threshold": "<= 35" + }, + { + "criterion": "cancel_rate_increase_pp", + "pass": true, + "actual": "n/a", + "threshold": "<= 25" + }, + { + "criterion": "error_rate_increase_pp", + "pass": true, + "actual": "n/a", + "threshold": "<= 25" + }, + { + "criterion": "cancel_latency_p95_increase_ms", + "pass": true, + "actual": "n/a", + "threshold": "<= 6000" + } + ] + } + }, + { + "backend": "native", + "pass": true, + "candidate": { + "tag": "2026-02-27", + "path": "/home/will/lab/flynn/docs/plans/artifacts/phase0_baseline_live_backend_native_2026-02-27.json", + "generated_at": "2026-02-27T16:45:18.490Z" + }, + "baseline": null, + "comparison": { + "baseline": null, + "candidate": { + "source_event_count": 110, + "sampled_event_count": 13, + "run_total_outcomes": 2, + "completion_rate_pct": 100, + "cancel_rate_pct": 0, + "error_rate_pct": 0, + "cancel_latency_p95_ms": null, + "reaction_match_rate_pct": null, + "reaction_skip_rate_pct": null + }, + "deltas": { + "sampled_event_count_pct": null, + "run_total_outcomes_pct": null, + "completion_rate_pp": null, + "cancel_rate_pp": null, + "error_rate_pp": null, + "cancel_latency_p95_ms": null, + "reaction_match_rate_pp": null, + "reaction_skip_rate_pp": null + } + }, + "freshness": { + "enabled": true, + "pass": true, + "actual_age_hours": 0.33, + "threshold_hours": 36 + }, + "drift_gate": { + "pass": true, + "criteria": [ + { + "criterion": 
"candidate_sampled_events", + "pass": true, + "actual": "13", + "threshold": ">= 10" + }, + { + "criterion": "sampled_events_drop_pct", + "pass": true, + "actual": "n/a", + "threshold": "<= 80" + }, + { + "criterion": "run_outcomes_drop_pct", + "pass": true, + "actual": "n/a", + "threshold": "<= 80" + }, + { + "criterion": "completion_rate_drop_pp", + "pass": true, + "actual": "n/a", + "threshold": "<= 35" + }, + { + "criterion": "cancel_rate_increase_pp", + "pass": true, + "actual": "n/a", + "threshold": "<= 25" + }, + { + "criterion": "error_rate_increase_pp", + "pass": true, + "actual": "n/a", + "threshold": "<= 25" + }, + { + "criterion": "cancel_latency_p95_increase_ms", + "pass": true, + "actual": "n/a", + "threshold": "<= 6000" + } + ] + } + } + ] +} diff --git a/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.md b/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.md new file mode 100644 index 0000000..93cdc14 --- /dev/null +++ b/docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.md @@ -0,0 +1,68 @@ +# Phase-0 Backend Drift Check + +Generated at: 2026-02-27T17:04:49.009Z +Artifacts: /home/will/lab/flynn/docs/plans/artifacts +Backends: pi_embedded, native +Freshness max age (hours): 36 +Overall gate: PASS + +## Thresholds +- requireBaselineHistory: false +- minCandidateSampledEvents: 10 +- maxSampledEventsDropPct: 80 +- maxRunOutcomesDropPct: 80 +- maxCompletionRateDropPp: 35 +- maxCancelRateIncreasePp: 25 +- maxErrorRateIncreasePp: 25 +- maxCancelLatencyP95IncreaseMs: 6000 + +## pi_embedded +- status: PASS +- candidate: tag=2026-02-27 file=/home/will/lab/flynn/docs/plans/artifacts/phase0_baseline_live_backend_pi_embedded_2026-02-27.json +- candidate generated_at: 2026-02-27T16:45:18.488Z +- baseline: none +- candidate snapshot: sampled=56 outcomes=25 completion=100% cancel=0% error=0% cancel_p95_ms=n/a +- deltas: + sampled_event_count_pct=n/a + run_total_outcomes_pct=n/a + completion_rate_pp=n/a + 
cancel_rate_pp=n/a + error_rate_pp=n/a + cancel_latency_p95_ms=n/a + reaction_match_rate_pp=n/a + reaction_skip_rate_pp=n/a +- freshness gate: PASS (age_hours=0.33 threshold=36) +- drift gate: PASS + PASS candidate_sampled_events actual=56 threshold=>= 10 + PASS sampled_events_drop_pct actual=n/a threshold=<= 80 + PASS run_outcomes_drop_pct actual=n/a threshold=<= 80 + PASS completion_rate_drop_pp actual=n/a threshold=<= 35 + PASS cancel_rate_increase_pp actual=n/a threshold=<= 25 + PASS error_rate_increase_pp actual=n/a threshold=<= 25 + PASS cancel_latency_p95_increase_ms actual=n/a threshold=<= 6000 + +## native +- status: PASS +- candidate: tag=2026-02-27 file=/home/will/lab/flynn/docs/plans/artifacts/phase0_baseline_live_backend_native_2026-02-27.json +- candidate generated_at: 2026-02-27T16:45:18.490Z +- baseline: none +- candidate snapshot: sampled=13 outcomes=2 completion=100% cancel=0% error=0% cancel_p95_ms=n/a +- deltas: + sampled_event_count_pct=n/a + run_total_outcomes_pct=n/a + completion_rate_pp=n/a + cancel_rate_pp=n/a + error_rate_pp=n/a + cancel_latency_p95_ms=n/a + reaction_match_rate_pp=n/a + reaction_skip_rate_pp=n/a +- freshness gate: PASS (age_hours=0.33 threshold=36) +- drift gate: PASS + PASS candidate_sampled_events actual=13 threshold=>= 10 + PASS sampled_events_drop_pct actual=n/a threshold=<= 80 + PASS run_outcomes_drop_pct actual=n/a threshold=<= 80 + PASS completion_rate_drop_pp actual=n/a threshold=<= 35 + PASS cancel_rate_increase_pp actual=n/a threshold=<= 25 + PASS error_rate_increase_pp actual=n/a threshold=<= 25 + PASS cancel_latency_p95_increase_ms actual=n/a threshold=<= 6000 + diff --git a/docs/plans/state.json b/docs/plans/state.json index 78aa33c..ac4cb46 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -215,6 +215,25 @@ ], "test_status": "pnpm test:run src/audit/phase0BaselineDrift.test.ts + pnpm audit:phase0-baseline:live:drift + pnpm typecheck passing" }, + "phase0-live-baseline-backend-drift-artifacts": 
{ + "status": "completed", + "date": "2026-02-27", + "updated": "2026-02-27", + "summary": "Extended backend drift checks to persist cadence artifacts by default (`phase0_baseline_live_backend_drift_<tag>.md/.json`) via `--write-default-artifacts`, wired package scripts accordingly, and generated the first drift artifact set.", + "files_modified": [ + "scripts/check-phase0-baseline-backend-drift.ts", + "package.json", + "README.md", + "docs/api/PROTOCOL.md", + "docs/architecture/AGENT_DIAGRAM.md", + "docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md", + "docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md", + "docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.md", + "docs/plans/artifacts/phase0_baseline_live_backend_drift_2026-02-27.json", + "docs/plans/state.json" + ], + "test_status": "pnpm audit:phase0-baseline:live:drift + pnpm test:run src/audit/phase0BaselineDrift.test.ts + pnpm typecheck passing" + }, "phase0-instrumentation-ticket-checklist": { "status": "completed", "date": "2026-02-25", @@ -7400,8 +7419,8 @@ "deeper_surfaces_phase0_ticket_02": "completed — gateway + daemon routing emit run lifecycle/cancel telemetry and reaction match/skip audit events with filter summaries and cancellation latency, plus focused tests", "deeper_surfaces_phase0_ticket_03": "completed — gateway metrics now track run-state outcomes, cancel latency samples, and reaction decision counters with routing/gateway emitters", "deeper_surfaces_phase0_ticket_04": "completed — added phase-0 baseline summary tooling for run outcomes, cancel latency, and reaction decisions with markdown/json CLI output", - "deeper_surfaces_phase0_ticket_05": "completed — documented phase-0 telemetry fields/workflow, refreshed architecture/protocol docs, generated anonymized live baseline artifacts for channel/gateway/backend-scoped (pi/native) windows, and added backend artifact freshness/drift gates (`pnpm audit:phase0-baseline:live:drift`)", - "next_up": "Run scheduled `pnpm 
audit:phase0-baseline:live:refresh:drift` in each active environment and observe at least one full cadence cycle before tightening drift thresholds or changing additional run-control/reaction semantics.", + "deeper_surfaces_phase0_ticket_05": "completed — documented phase-0 telemetry fields/workflow, refreshed architecture/protocol docs, generated anonymized live baseline artifacts for channel/gateway/backend-scoped (pi/native) windows, and added backend artifact freshness/drift gates with persisted drift reports (`phase0_baseline_live_backend_drift_.{md,json}`)", + "next_up": "Run scheduled `pnpm audit:phase0-baseline:live:refresh:drift` in each active environment and collect at least one additional UTC-date drift artifact so baseline-vs-prior comparisons become active before tightening thresholds or changing additional run-control/reaction semantics.", "pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default", "pi_embedded_evaluation_phase": "completed — final decision rollback (applied in runtime config): Window A failed latency/fallback gates (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B remained sample-insufficient; controlled probes verified guard coverage (pi_no_tools_mode/capability_query/attachments_present each hit once)", "pi_embedded_manual_mode": "completed — added persisted runtime backend controls for manual Pi activation/deactivation (`/runtime` preferred, `/backend` alias; `status`, `activate pi`, `deactivate pi`, `use config`) while keeping config-driven default routing", diff --git a/package.json b/package.json index 6d8d99a..b00b5e7 100644 --- a/package.json +++ b/package.json @@ -27,7 +27,7 @@ "audit:phase0-baseline:live:native": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit 
~/.local/share/flynn/audit.log --source channel --backend native --exclude-session-substring probe", "audit:phase0-baseline:live:gateway": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source gateway --auto-gateway-cancel-window", "audit:phase0-baseline:live:refresh": "pnpm audit:phase0-baseline:live && pnpm audit:phase0-baseline:live:gateway", - "audit:phase0-baseline:live:drift": "node --import tsx/esm scripts/check-phase0-baseline-backend-drift.ts --artifacts-dir docs/plans/artifacts --backend pi_embedded,native --max-age-hours 36 --min-candidate-sampled-events 10 --max-sampled-events-drop-pct 80 --max-run-outcomes-drop-pct 80 --max-completion-rate-drop-pp 35 --max-cancel-rate-increase-pp 25 --max-error-rate-increase-pp 25 --max-cancel-latency-p95-increase-ms 6000", + "audit:phase0-baseline:live:drift": "node --import tsx/esm scripts/check-phase0-baseline-backend-drift.ts --artifacts-dir docs/plans/artifacts --backend pi_embedded,native --max-age-hours 36 --min-candidate-sampled-events 10 --max-sampled-events-drop-pct 80 --max-run-outcomes-drop-pct 80 --max-completion-rate-drop-pp 35 --max-cancel-rate-increase-pp 25 --max-error-rate-increase-pp 25 --max-cancel-latency-p95-increase-ms 6000 --write-default-artifacts", "audit:phase0-baseline:live:refresh:drift": "pnpm audit:phase0-baseline:live:refresh && pnpm audit:phase0-baseline:live:drift", "audit:backend-canary:probes": "node --import tsx/esm scripts/run-pi-canary-guard-probes.ts", "companion:bundle": "node --import tsx/esm scripts/build-companion-release-bundle.ts", diff --git a/scripts/check-phase0-baseline-backend-drift.ts b/scripts/check-phase0-baseline-backend-drift.ts index 777ecaf..a29e443 100644 --- a/scripts/check-phase0-baseline-backend-drift.ts +++ b/scripts/check-phase0-baseline-backend-drift.ts @@ -61,6 +61,10 @@ function usage(): string { ' --baseline-tag Baseline artifact tag (default: previous available per backend)', ' --max-age-hours 
Require candidate artifact freshness (optional)', ' --require-baseline-history Fail when no prior artifact exists', + ' --report-tag Drift report tag (default: current UTC date)', + ' --write-default-artifacts Write markdown/json drift reports under artifacts dir', + ' --summary-json-out Write JSON report to path', + ' --summary-md-out Write Markdown report to path', ' --format Output format (default: markdown)', ' --out Write output to file instead of stdout', '', @@ -76,6 +80,10 @@ function usage(): string { ].join('\n'); } +function isoDateTagNow(): string { + return new Date().toISOString().slice(0, 10); +} + function parseCsv(value: string | undefined): string[] | undefined { if (!value) { return undefined; @@ -311,6 +319,10 @@ async function main(): Promise { 'baseline-tag': { type: 'string' }, 'max-age-hours': { type: 'string' }, 'require-baseline-history': { type: 'boolean' }, + 'report-tag': { type: 'string' }, + 'write-default-artifacts': { type: 'boolean' }, + 'summary-json-out': { type: 'string' }, + 'summary-md-out': { type: 'string' }, 'min-candidate-sampled-events': { type: 'string' }, 'min-baseline-sampled-events': { type: 'string' }, 'max-sampled-events-drop-pct': { type: 'string' }, @@ -337,11 +349,25 @@ async function main(): Promise { const candidateTag = values.tag; const baselineTag = values['baseline-tag']; const format = parseFormat(values.format); + const reportTag = values['report-tag'] ?? isoDateTagNow(); + const writeDefaultArtifacts = Boolean(values['write-default-artifacts']); const maxAgeHours = parseOptionalNumber(values['max-age-hours'], '--max-age-hours'); if (typeof maxAgeHours === 'number' && maxAgeHours < 0) { throw new Error('--max-age-hours must be >= 0.'); } + const defaultBaseName = resolve(artifactsDir, `phase0_baseline_live_backend_drift_${reportTag}`); + const summaryJsonOut = values['summary-json-out'] + ? resolve(values['summary-json-out']) + : writeDefaultArtifacts + ? 
`${defaultBaseName}.json` + : undefined; + const summaryMdOut = values['summary-md-out'] + ? resolve(values['summary-md-out']) + : writeDefaultArtifacts + ? `${defaultBaseName}.md` + : undefined; + const thresholds = buildThresholds(values as Record); const allRecords = await readArtifactRecords(artifactsDir); const nowMs = Date.now(); @@ -396,37 +422,49 @@ async function main(): Promise { } const overallPass = results.every((result) => result.pass); - const output = format === 'json' - ? JSON.stringify({ - generated_at: new Date().toISOString(), - artifacts_dir: artifactsDir, - backends, - candidate_tag: candidateTag, - baseline_tag: baselineTag, - max_age_hours: maxAgeHours, - thresholds, - overall_pass: overallPass, - results: results.map((result) => ({ - backend: result.backend, - pass: result.pass, - candidate: { - tag: result.candidate.tag, - path: result.candidate.path, - generated_at: result.candidate.generatedAtIso, - }, - baseline: result.baseline - ? { - tag: result.baseline.tag, - path: result.baseline.path, - generated_at: result.baseline.generatedAtIso, - } - : null, - comparison: result.comparison, - freshness: result.freshness, - drift_gate: result.driftGate, - })), - }, null, 2) - : renderMarkdown(artifactsDir, backends, thresholds, maxAgeHours, results, overallPass); + const jsonOutput = JSON.stringify({ + generated_at: new Date().toISOString(), + artifacts_dir: artifactsDir, + backends, + candidate_tag: candidateTag, + baseline_tag: baselineTag, + report_tag: reportTag, + max_age_hours: maxAgeHours, + thresholds, + overall_pass: overallPass, + reports: { + summary_json_out: summaryJsonOut, + summary_md_out: summaryMdOut, + }, + results: results.map((result) => ({ + backend: result.backend, + pass: result.pass, + candidate: { + tag: result.candidate.tag, + path: result.candidate.path, + generated_at: result.candidate.generatedAtIso, + }, + baseline: result.baseline + ? 
{ + tag: result.baseline.tag, + path: result.baseline.path, + generated_at: result.baseline.generatedAtIso, + } + : null, + comparison: result.comparison, + freshness: result.freshness, + drift_gate: result.driftGate, + })), + }, null, 2); + const markdownOutput = renderMarkdown(artifactsDir, backends, thresholds, maxAgeHours, results, overallPass); + const output = format === 'json' ? jsonOutput : markdownOutput; + + if (summaryJsonOut) { + await writeOutput(summaryJsonOut, jsonOutput); + } + if (summaryMdOut) { + await writeOutput(summaryMdOut, markdownOutput); + } if (values.out) { await writeOutput(resolve(values.out), output);