feat(audit): add phase0 backend drift and freshness gates
This commit is contained in:
@@ -1640,9 +1640,14 @@ One-shot refresh for both channel + gateway live windows:
|
|||||||
pnpm audit:phase0-baseline:live:refresh
|
pnpm audit:phase0-baseline:live:refresh
|
||||||
```
|
```
|
||||||
|
|
||||||
Cadence scheduling (example: every 6 hours via host cron):
|
Backend drift/freshness gate for backend-scoped artifacts (`pi_embedded` vs `native`):
|
||||||
```bash
|
```bash
|
||||||
0 */6 * * * cd /path/to/flynn && pnpm audit:phase0-baseline:live:refresh >> ~/.local/share/flynn/phase0_baseline_refresh.log 2>&1
|
pnpm audit:phase0-baseline:live:drift
|
||||||
|
```
|
||||||
|
|
||||||
|
Cadence scheduling (example: every 6 hours via host cron) with drift check:
|
||||||
|
```bash
|
||||||
|
0 */6 * * * cd /path/to/flynn && pnpm audit:phase0-baseline:live:refresh:drift >> ~/.local/share/flynn/phase0_baseline_refresh.log 2>&1
|
||||||
```
|
```
|
||||||
`audit:phase0-baseline:live*` scripts now default to the current UTC date tag when `--tag` is omitted.
|
`audit:phase0-baseline:live*` scripts now default to the current UTC date tag when `--tag` is omitted.
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ The gateway provides:
|
|||||||
- **HTTP Server**: Serves static dashboard and handles webhook endpoints
|
- **HTTP Server**: Serves static dashboard and handles webhook endpoints
|
||||||
- **Node Capability Negotiation**: Optional companion-node role/capability registration
|
- **Node Capability Negotiation**: Optional companion-node role/capability registration
|
||||||
|
|
||||||
Operational note: onboarding (`flynn setup` / `flynn onboard`) now runs post-save live readiness checks (model/channel/memory/automation) and prints a guided first-success task flow. Companion CLI now also supports bootstrap-manifest export (`flynn companion --export-bootstrap <path|->`), release-bundle export (`--export-release-bundle <dir>` with optional `--signing-key`/`--signing-key-id` signature output), release-bundle verification (`--verify-release-bundle <dir>` with optional `--verify-signing-key`/`--verify-signing-key-id`/`--require-signature`), platform shell-template export (`--export-shell-template <dir>`), plus richer shell bootstrap flags for status/location/push (`--app-version`, `--latitude/--longitude`, `--push-token`, etc.) for desktop/mobile app packaging without changing JSON-RPC method/event shapes. Audit observability now includes live phase-0 baseline capture flows: `pnpm audit:phase0-baseline:live` for channel-origin windows, backend-scoped variants (`pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`) via `--backend`, `pnpm audit:phase0-baseline:live:gateway` (auto-detected cancel window) for gateway-origin windows, and `pnpm audit:phase0-baseline:live:refresh` for one-shot refresh of both windows. These scripts default to current UTC-date tags unless `--tag` is explicitly provided.
|
Operational note: onboarding (`flynn setup` / `flynn onboard`) now runs post-save live readiness checks (model/channel/memory/automation) and prints a guided first-success task flow. Companion CLI now also supports bootstrap-manifest export (`flynn companion --export-bootstrap <path|->`), release-bundle export (`--export-release-bundle <dir>` with optional `--signing-key`/`--signing-key-id` signature output), release-bundle verification (`--verify-release-bundle <dir>` with optional `--verify-signing-key`/`--verify-signing-key-id`/`--require-signature`), platform shell-template export (`--export-shell-template <dir>`), plus richer shell bootstrap flags for status/location/push (`--app-version`, `--latitude/--longitude`, `--push-token`, etc.) for desktop/mobile app packaging without changing JSON-RPC method/event shapes. Audit observability now includes live phase-0 baseline capture flows: `pnpm audit:phase0-baseline:live` for channel-origin windows, backend-scoped variants (`pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`) via `--backend`, `pnpm audit:phase0-baseline:live:gateway` (auto-detected cancel window) for gateway-origin windows, `pnpm audit:phase0-baseline:live:refresh` for one-shot refresh of both windows, and `pnpm audit:phase0-baseline:live:drift` for backend artifact freshness/drift gates. These scripts default to current UTC-date tags unless `--tag` is explicitly provided.
|
||||||
|
|
||||||
### Execution Model (Sessions + Per-Session Queue)
|
### Execution Model (Sessions + Per-Session Queue)
|
||||||
|
|
||||||
|
|||||||
@@ -170,6 +170,7 @@ Gateway streaming UX signals:
|
|||||||
- `pnpm audit:phase0-baseline:live:pi` and `pnpm audit:phase0-baseline:live:native` capture backend-scoped channel windows using `backend.route` timelines.
|
- `pnpm audit:phase0-baseline:live:pi` and `pnpm audit:phase0-baseline:live:native` capture backend-scoped channel windows using `backend.route` timelines.
|
||||||
- `pnpm audit:phase0-baseline:live:gateway` captures gateway-origin baseline windows by auto-selecting the latest cancel/cancelled session window (or use `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit windows).
|
- `pnpm audit:phase0-baseline:live:gateway` captures gateway-origin baseline windows by auto-selecting the latest cancel/cancelled session window (or use `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit windows).
|
||||||
- `pnpm audit:phase0-baseline:live:refresh` runs both channel + gateway capture commands in one step for cadence refreshes.
|
- `pnpm audit:phase0-baseline:live:refresh` runs both channel + gateway capture commands in one step for cadence refreshes.
|
||||||
|
- `pnpm audit:phase0-baseline:live:drift` evaluates backend-scoped artifact freshness/drift gates, and `pnpm audit:phase0-baseline:live:refresh:drift` runs capture + drift checks in one cadence step.
|
||||||
- `audit:phase0-baseline:live*` scripts are cadence-safe by default (UTC-date tags auto-generated unless explicitly overridden).
|
- `audit:phase0-baseline:live*` scripts are cadence-safe by default (UTC-date tags auto-generated unless explicitly overridden).
|
||||||
- Canvas artifacts are persisted by the gateway so session UI surfaces can recover after daemon restarts.
|
- Canvas artifacts are persisted by the gateway so session UI surfaces can recover after daemon restarts.
|
||||||
- TTS synthesis uses an ordered provider chain with health cooldown tracking; if all providers fail, replies degrade to text-only without dropping the response.
|
- TTS synthesis uses an ordered provider chain with health cooldown tracking; if all providers fail, replies degrade to text-only without dropping the response.
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ If you only want the protocol surface, see `docs/api/PROTOCOL.md`.
|
|||||||
- Backend-scoped channel snapshots can be regenerated with `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native` (`--backend` filtering via `backend.route` timelines).
|
- Backend-scoped channel snapshots can be regenerated with `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native` (`--backend` filtering via `backend.route` timelines).
|
||||||
- Gateway-origin phase-0 windows (including cancel-path samples) can be captured with `pnpm audit:phase0-baseline:live:gateway` (auto-detect latest cancel window) or `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit bounds.
|
- Gateway-origin phase-0 windows (including cancel-path samples) can be captured with `pnpm audit:phase0-baseline:live:gateway` (auto-detect latest cancel window) or `scripts/capture-phase0-live-baseline.ts --source gateway --since ... --until ...` for explicit bounds.
|
||||||
- `pnpm audit:phase0-baseline:live:refresh` runs both capture paths to refresh channel + gateway artifacts in one command.
|
- `pnpm audit:phase0-baseline:live:refresh` runs both capture paths to refresh channel + gateway artifacts in one command.
|
||||||
|
- `pnpm audit:phase0-baseline:live:drift` checks backend-scoped artifact freshness/drift gates; `pnpm audit:phase0-baseline:live:refresh:drift` chains refresh + drift checks for scheduled cadence runs.
|
||||||
- `audit:phase0-baseline:live*` package scripts now omit fixed tags so scheduled runs automatically roll to current UTC-date artifact tags.
|
- `audit:phase0-baseline:live*` package scripts now omit fixed tags so scheduled runs automatically roll to current UTC-date artifact tags.
|
||||||
- Companion CLI supports one-shot shell bootstrap metadata for live sessions (`--app-version`/`--status-text`, `--latitude`/`--longitude`, `--push-token`) so desktop/mobile wrappers can initialize node status/location/push in a single launch flow.
|
- Companion CLI supports one-shot shell bootstrap metadata for live sessions (`--app-version`/`--status-text`, `--latitude`/`--longitude`, `--push-token`) so desktop/mobile wrappers can initialize node status/location/push in a single launch flow.
|
||||||
- Canvas artifacts are persisted per session under the gateway data directory for UI recovery across restarts.
|
- Canvas artifacts are persisted per session under the gateway data directory for UI recovery across restarts.
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ Phase 0 is complete when:
|
|||||||
2. A baseline summary artifact is generated and committed under `docs/plans/artifacts/`.
|
2. A baseline summary artifact is generated and committed under `docs/plans/artifacts/`.
|
||||||
3. No user-visible response behavior changed compared to pre-phase baseline.
|
3. No user-visible response behavior changed compared to pre-phase baseline.
|
||||||
|
|
||||||
Follow-up status (2026-02-27): live channel-session artifacts exist under `docs/plans/artifacts/phase0_baseline_live_2026-02-27.*` via `pnpm audit:phase0-baseline:live` (anonymized IDs), and a second gateway-origin live window (including `run.cancel` + `cancel_requested`/`cancelled`) exists under `docs/plans/artifacts/phase0_baseline_live_gateway_2026-02-27.*`. Gateway window refreshes can now run via `pnpm audit:phase0-baseline:live:gateway` (auto-selected cancel window), both windows can be refreshed together with `pnpm audit:phase0-baseline:live:refresh` (scheduling example included in README), and backend-scoped channel windows are now available via `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`.
|
Follow-up status (2026-02-27): live channel-session artifacts exist under `docs/plans/artifacts/phase0_baseline_live_2026-02-27.*` via `pnpm audit:phase0-baseline:live` (anonymized IDs), and a second gateway-origin live window (including `run.cancel` + `cancel_requested`/`cancelled`) exists under `docs/plans/artifacts/phase0_baseline_live_gateway_2026-02-27.*`. Gateway window refreshes can now run via `pnpm audit:phase0-baseline:live:gateway` (auto-selected cancel window), both windows can be refreshed together with `pnpm audit:phase0-baseline:live:refresh` (scheduling example included in README), backend-scoped channel windows are now available via `pnpm audit:phase0-baseline:live:pi` / `pnpm audit:phase0-baseline:live:native`, and backend artifact freshness/drift checks are now available via `pnpm audit:phase0-baseline:live:drift` (or chained with `pnpm audit:phase0-baseline:live:refresh:drift`).
|
||||||
|
|
||||||
## Subagent Model Assignment Plan
|
## Subagent Model Assignment Plan
|
||||||
|
|
||||||
|
|||||||
+22
-3
@@ -196,6 +196,25 @@
|
|||||||
],
|
],
|
||||||
"test_status": "pnpm audit:phase0-baseline:live:pi + pnpm audit:phase0-baseline:live:native + pnpm test:run src/audit/phase0LiveBaseline.test.ts src/audit/phase0BaselineSummary.test.ts + pnpm typecheck passing"
|
"test_status": "pnpm audit:phase0-baseline:live:pi + pnpm audit:phase0-baseline:live:native + pnpm test:run src/audit/phase0LiveBaseline.test.ts src/audit/phase0BaselineSummary.test.ts + pnpm typecheck passing"
|
||||||
},
|
},
|
||||||
|
"phase0-live-baseline-backend-drift-monitoring": {
|
||||||
|
"status": "completed",
|
||||||
|
"date": "2026-02-27",
|
||||||
|
"updated": "2026-02-27",
|
||||||
|
"summary": "Implemented backend-scoped phase-0 baseline drift/freshness gates for cadence monitoring by adding artifact comparison helpers, threshold evaluation, and an auto-discovery CLI (`check-phase0-baseline-backend-drift.ts`) with package scripts for standalone checks and chained refresh+drift runs.",
|
||||||
|
"files_modified": [
|
||||||
|
"src/audit/phase0BaselineDrift.ts",
|
||||||
|
"src/audit/phase0BaselineDrift.test.ts",
|
||||||
|
"scripts/check-phase0-baseline-backend-drift.ts",
|
||||||
|
"package.json",
|
||||||
|
"README.md",
|
||||||
|
"docs/api/PROTOCOL.md",
|
||||||
|
"docs/architecture/AGENT_DIAGRAM.md",
|
||||||
|
"docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md",
|
||||||
|
"docs/plans/2026-02-25-phase0-instrumentation-ticket-checklist.md",
|
||||||
|
"docs/plans/state.json"
|
||||||
|
],
|
||||||
|
"test_status": "pnpm test:run src/audit/phase0BaselineDrift.test.ts + pnpm audit:phase0-baseline:live:drift + pnpm typecheck passing"
|
||||||
|
},
|
||||||
"phase0-instrumentation-ticket-checklist": {
|
"phase0-instrumentation-ticket-checklist": {
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
"date": "2026-02-25",
|
"date": "2026-02-25",
|
||||||
@@ -7345,7 +7364,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"overall_progress": {
|
"overall_progress": {
|
||||||
"total_test_count": 2589,
|
"total_test_count": 2590,
|
||||||
"all_tests_passing": true,
|
"all_tests_passing": true,
|
||||||
"p0_completion": "3/3 (100%)",
|
"p0_completion": "3/3 (100%)",
|
||||||
"p1_completion": "4/4 (100%)",
|
"p1_completion": "4/4 (100%)",
|
||||||
@@ -7381,8 +7400,8 @@
|
|||||||
"deeper_surfaces_phase0_ticket_02": "completed — gateway + daemon routing emit run lifecycle/cancel telemetry and reaction match/skip audit events with filter summaries and cancellation latency, plus focused tests",
|
"deeper_surfaces_phase0_ticket_02": "completed — gateway + daemon routing emit run lifecycle/cancel telemetry and reaction match/skip audit events with filter summaries and cancellation latency, plus focused tests",
|
||||||
"deeper_surfaces_phase0_ticket_03": "completed — gateway metrics now track run-state outcomes, cancel latency samples, and reaction decision counters with routing/gateway emitters",
|
"deeper_surfaces_phase0_ticket_03": "completed — gateway metrics now track run-state outcomes, cancel latency samples, and reaction decision counters with routing/gateway emitters",
|
||||||
"deeper_surfaces_phase0_ticket_04": "completed — added phase-0 baseline summary tooling for run outcomes, cancel latency, and reaction decisions with markdown/json CLI output",
|
"deeper_surfaces_phase0_ticket_04": "completed — added phase-0 baseline summary tooling for run outcomes, cancel latency, and reaction decisions with markdown/json CLI output",
|
||||||
"deeper_surfaces_phase0_ticket_05": "completed — documented phase-0 telemetry fields/workflow, refreshed architecture/protocol docs, and generated anonymized live baseline artifacts for channel, gateway, and backend-scoped (pi/native) traffic windows",
|
"deeper_surfaces_phase0_ticket_05": "completed — documented phase-0 telemetry fields/workflow, refreshed architecture/protocol docs, generated anonymized live baseline artifacts for channel/gateway/backend-scoped (pi/native) windows, and added backend artifact freshness/drift gates (`pnpm audit:phase0-baseline:live:drift`)",
|
||||||
"next_up": "Apply scheduled `pnpm audit:phase0-baseline:live:refresh` in each active environment and monitor backend-scoped (`pi_embedded` vs `native`) artifact freshness/drift over at least one full cadence cycle before additional run-control/reaction semantic changes.",
|
"next_up": "Run scheduled `pnpm audit:phase0-baseline:live:refresh:drift` in each active environment and observe at least one full cadence cycle before tightening drift thresholds or changing additional run-control/reaction semantics.",
|
||||||
"pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
|
"pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
|
||||||
"pi_embedded_evaluation_phase": "completed — final decision rollback (applied in runtime config): Window A failed latency/fallback gates (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B remained sample-insufficient; controlled probes verified guard coverage (pi_no_tools_mode/capability_query/attachments_present each hit once)",
|
"pi_embedded_evaluation_phase": "completed — final decision rollback (applied in runtime config): Window A failed latency/fallback gates (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B remained sample-insufficient; controlled probes verified guard coverage (pi_no_tools_mode/capability_query/attachments_present each hit once)",
|
||||||
"pi_embedded_manual_mode": "completed — added persisted runtime backend controls for manual Pi activation/deactivation (`/runtime` preferred, `/backend` alias; `status`, `activate pi`, `deactivate pi`, `use config`) while keeping config-driven default routing",
|
"pi_embedded_manual_mode": "completed — added persisted runtime backend controls for manual Pi activation/deactivation (`/runtime` preferred, `/backend` alias; `status`, `activate pi`, `deactivate pi`, `use config`) while keeping config-driven default routing",
|
||||||
|
|||||||
@@ -27,6 +27,8 @@
|
|||||||
"audit:phase0-baseline:live:native": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source channel --backend native --exclude-session-substring probe",
|
"audit:phase0-baseline:live:native": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source channel --backend native --exclude-session-substring probe",
|
||||||
"audit:phase0-baseline:live:gateway": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source gateway --auto-gateway-cancel-window",
|
"audit:phase0-baseline:live:gateway": "node --import tsx/esm scripts/capture-phase0-live-baseline.ts --audit ~/.local/share/flynn/audit.log --source gateway --auto-gateway-cancel-window",
|
||||||
"audit:phase0-baseline:live:refresh": "pnpm audit:phase0-baseline:live && pnpm audit:phase0-baseline:live:gateway",
|
"audit:phase0-baseline:live:refresh": "pnpm audit:phase0-baseline:live && pnpm audit:phase0-baseline:live:gateway",
|
||||||
|
"audit:phase0-baseline:live:drift": "node --import tsx/esm scripts/check-phase0-baseline-backend-drift.ts --artifacts-dir docs/plans/artifacts --backend pi_embedded,native --max-age-hours 36 --min-candidate-sampled-events 10 --max-sampled-events-drop-pct 80 --max-run-outcomes-drop-pct 80 --max-completion-rate-drop-pp 35 --max-cancel-rate-increase-pp 25 --max-error-rate-increase-pp 25 --max-cancel-latency-p95-increase-ms 6000",
|
||||||
|
"audit:phase0-baseline:live:refresh:drift": "pnpm audit:phase0-baseline:live:refresh && pnpm audit:phase0-baseline:live:drift",
|
||||||
"audit:backend-canary:probes": "node --import tsx/esm scripts/run-pi-canary-guard-probes.ts",
|
"audit:backend-canary:probes": "node --import tsx/esm scripts/run-pi-canary-guard-probes.ts",
|
||||||
"companion:bundle": "node --import tsx/esm scripts/build-companion-release-bundle.ts",
|
"companion:bundle": "node --import tsx/esm scripts/build-companion-release-bundle.ts",
|
||||||
"companion:reference-apps": "node --import tsx/esm scripts/export-companion-reference-apps.ts",
|
"companion:reference-apps": "node --import tsx/esm scripts/export-companion-reference-apps.ts",
|
||||||
|
|||||||
@@ -0,0 +1,446 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
|
||||||
|
import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises';
|
||||||
|
import { dirname, resolve } from 'node:path';
|
||||||
|
import { parseArgs } from 'node:util';
|
||||||
|
import {
|
||||||
|
comparePhase0BaselineDrift,
|
||||||
|
evaluatePhase0BaselineDriftGate,
|
||||||
|
renderPhase0BaselineDriftSnapshot,
|
||||||
|
type Phase0BaselineArtifactDocument,
|
||||||
|
type Phase0BaselineDriftComparison,
|
||||||
|
type Phase0BaselineDriftGateThresholds,
|
||||||
|
} from '../src/audit/phase0BaselineDrift.js';
|
||||||
|
import type { Phase0BackendTarget } from '../src/audit/phase0LiveBaseline.js';
|
||||||
|
|
||||||
|
/** Report renderings accepted by the `--format` CLI flag. */
type OutputFormat = 'markdown' | 'json';
|
||||||
|
|
||||||
|
/** One backend-scoped baseline artifact discovered on disk, plus its parsed JSON document. */
interface ArtifactRecord {
|
||||||
|
// Backend name captured from the artifact file name.
backend: Phase0BackendTarget;
|
||||||
|
// Tag portion of the artifact file name (the text between the backend name and `.json`).
tag: string;
|
||||||
|
// Resolved path of the artifact file inside the artifacts directory.
path: string;
|
||||||
|
// Raw `generated_at` value from the document; only set when that field is a string.
generatedAtIso?: string;
|
||||||
|
// `generatedAtIso` parsed with `Date.parse`; left undefined when the result is not finite.
generatedAtMs?: number;
|
||||||
|
// Parsed JSON contents of the artifact file.
document: Phase0BaselineArtifactDocument;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Per-backend outcome of a drift check run.
 * NOTE(review): the code that builds this object is outside the visible chunk, so the
 * field notes below are inferred from names and from how the markdown renderer reads
 * them — confirm against the construction site.
 */
interface BackendDriftResult {
|
||||||
|
// Backend this result belongs to.
backend: Phase0BackendTarget;
|
||||||
|
// Artifact being evaluated.
candidate: ArtifactRecord;
|
||||||
|
// Artifact compared against; absent when none was selected (rendered as "baseline: none").
baseline?: ArtifactRecord;
|
||||||
|
// Comparison payload; the renderer reads its `candidate`, `baseline`, and `deltas` members.
comparison: Phase0BaselineDriftComparison;
|
||||||
|
// Freshness gate outcome — presumably driven by `--max-age-hours`; TODO confirm.
freshness: {
|
||||||
|
enabled: boolean;
|
||||||
|
pass: boolean;
|
||||||
|
// null presumably means the age could not be determined — TODO confirm.
actual_age_hours: number | null;
|
||||||
|
threshold_hours: number | null;
|
||||||
|
};
|
||||||
|
// Result returned by `evaluatePhase0BaselineDriftGate` for this backend.
driftGate: ReturnType<typeof evaluatePhase0BaselineDriftGate>;
|
||||||
|
// Combined verdict for this backend; rendered as PASS/FAIL in the report.
pass: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Backend names accepted by `--backend`; `parseBackends` rejects anything not listed here.
 * Keep in sync with the alternation in ARTIFACT_JSON_PATTERN, which lists the same names.
 */
const BACKEND_TARGETS: readonly Phase0BackendTarget[] = [
|
||||||
|
'native',
|
||||||
|
'claude_code',
|
||||||
|
'opencode',
|
||||||
|
'codex',
|
||||||
|
'gemini',
|
||||||
|
'pi_embedded',
|
||||||
|
];
|
||||||
|
|
||||||
|
/**
 * Matches backend-scoped artifact file names of the form
 * `phase0_baseline_live_backend_<backend>_<tag>.json`.
 * Capture group 1 is the backend name, group 2 is the tag.
 */
const ARTIFACT_JSON_PATTERN = /^phase0_baseline_live_backend_(native|claude_code|opencode|codex|gemini|pi_embedded)_(.+)\.json$/;
|
||||||
|
|
||||||
|
/**
 * Builds the CLI help text.
 *
 * @returns The usage banner, general options, and drift-threshold flags joined by newlines.
 */
function usage(): string {
  const banner = 'Usage: node --import tsx/esm scripts/check-phase0-baseline-backend-drift.ts [options]';

  const generalOptions = [
    'Options:',
    ' --artifacts-dir <path> Artifacts directory (default: docs/plans/artifacts)',
    ' --backend <name[,name...]> Backends to check (default: pi_embedded,native)',
    ' --tag <value> Candidate artifact tag (default: latest available per backend)',
    ' --baseline-tag <value> Baseline artifact tag (default: previous available per backend)',
    ' --max-age-hours <number> Require candidate artifact freshness (optional)',
    ' --require-baseline-history Fail when no prior artifact exists',
    ' --format <markdown|json> Output format (default: markdown)',
    ' --out <path> Write output to file instead of stdout',
  ];

  const thresholdOptions = [
    'Drift thresholds (optional):',
    ' --min-candidate-sampled-events <number>',
    ' --min-baseline-sampled-events <number>',
    ' --max-sampled-events-drop-pct <number>',
    ' --max-run-outcomes-drop-pct <number>',
    ' --max-completion-rate-drop-pp <number>',
    ' --max-cancel-rate-increase-pp <number>',
    ' --max-error-rate-increase-pp <number>',
    ' --max-cancel-latency-p95-increase-ms <number>',
  ];

  // Sections are separated by a single blank line.
  return [banner, '', ...generalOptions, '', ...thresholdOptions].join('\n');
}
|
||||||
|
|
||||||
|
/**
 * Splits a comma-separated flag value into trimmed, non-empty entries.
 *
 * @param value Raw flag value, if the flag was supplied.
 * @returns The entries in input order, or `undefined` when the value is missing
 *   or contains no non-blank entries.
 */
function parseCsv(value: string | undefined): string[] | undefined {
  if (!value) {
    return undefined;
  }

  const entries: string[] = [];
  for (const piece of value.split(',')) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }

  return entries.length === 0 ? undefined : entries;
}
|
||||||
|
|
||||||
|
/**
 * Parses an optional numeric CLI flag.
 *
 * @param raw Raw flag value; `undefined` or `''` means the flag was not provided.
 * @param flag Flag name used in the error message (e.g. `--max-age-hours`).
 * @returns The parsed finite number, or `undefined` when the flag is absent.
 * @throws Error when the value is blank or does not parse to a finite number.
 */
function parseOptionalNumber(raw: string | undefined, flag: string): number | undefined {
  if (!raw) {
    return undefined;
  }

  // `Number('   ')` evaluates to 0, so a whitespace-only value would otherwise be
  // accepted silently as a threshold of zero. Treat it as invalid input instead.
  if (raw.trim().length === 0) {
    throw new Error(`Invalid ${flag} value "${raw}". Expected a number.`);
  }

  const parsed = Number(raw);
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid ${flag} value "${raw}". Expected a number.`);
  }
  return parsed;
}
|
||||||
|
|
||||||
|
/**
 * Resolves the `--backend` flag into a validated list of backend targets.
 *
 * @param raw Comma-separated backend names, if the flag was supplied.
 * @returns The requested backends in input order; `pi_embedded` and `native` when
 *   the flag is missing or blank.
 * @throws Error on the first name that is not listed in BACKEND_TARGETS.
 */
function parseBackends(raw: string | undefined): Phase0BackendTarget[] {
  const requested = parseCsv(raw) ?? ['pi_embedded', 'native'];

  return requested.map((name) => {
    if (!BACKEND_TARGETS.includes(name as Phase0BackendTarget)) {
      throw new Error(`Invalid backend "${name}".`);
    }
    return name as Phase0BackendTarget;
  });
}
|
||||||
|
|
||||||
|
/**
 * Validates the `--format` flag.
 *
 * @param raw Raw flag value; defaults to `markdown` when omitted.
 * @returns The selected output format.
 * @throws Error when the value is neither `markdown` nor `json`.
 */
function parseFormat(raw: string | undefined): OutputFormat {
  const requested = raw ?? 'markdown';

  switch (requested) {
    case 'markdown':
    case 'json':
      return requested;
    default:
      throw new Error(`Invalid --format value "${requested}".`);
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns a copy of the records ordered newest first.
 *
 * Records are ranked by `generatedAtMs` (a missing timestamp counts as 0); ties are
 * broken by descending tag order. The input array is left untouched.
 *
 * @param records Artifact records to order.
 * @returns A new array sorted from newest to oldest.
 */
function sortRecordsDesc(records: ArtifactRecord[]): ArtifactRecord[] {
  const newestFirst = (left: ArtifactRecord, right: ArtifactRecord): number => {
    const timeDelta = (right.generatedAtMs ?? 0) - (left.generatedAtMs ?? 0);
    return timeDelta !== 0 ? timeDelta : right.tag.localeCompare(left.tag);
  };

  return records.slice().sort(newestFirst);
}
|
||||||
|
|
||||||
|
/**
 * Formats a delta rounded to two decimals with an explicit `+` on positive values.
 *
 * @param value Delta to format; `null` or a non-finite number renders as `n/a`.
 * @param suffix Optional unit appended after the number (e.g. `%`).
 * @returns The formatted delta, or `n/a` when no usable value is available.
 */
function formatSignedNumber(value: number | null, suffix = ''): string {
  if (value !== null && Number.isFinite(value)) {
    const twoDecimals = Math.round(value * 100) / 100;
    const plusSign = twoDecimals > 0 ? '+' : '';
    return plusSign + String(twoDecimals) + suffix;
  }
  return 'n/a';
}
|
||||||
|
|
||||||
|
/**
 * Formats an artifact age in hours, rounded to two decimals.
 *
 * @param value Age in hours; `null` or a non-finite number renders as `n/a`.
 * @returns The rounded age as a string, or `n/a`.
 */
function formatFreshnessHours(value: number | null): string {
  return value !== null && Number.isFinite(value)
    ? String(Math.round(value * 100) / 100)
    : 'n/a';
}
|
||||||
|
|
||||||
|
/**
 * Writes the rendered report to disk, creating parent directories as needed.
 *
 * @param pathValue Destination file path.
 * @param output Report text; a trailing newline is appended before writing.
 */
async function writeOutput(pathValue: string, output: string): Promise<void> {
  const parentDir = dirname(pathValue);
  const contents = `${output}\n`;

  await mkdir(parentDir, { recursive: true });
  await writeFile(pathValue, contents, 'utf8');
}
|
||||||
|
|
||||||
|
/**
 * Maps parsed CLI values onto the drift-gate threshold structure.
 *
 * @param values Parsed CLI values keyed by flag name without the leading dashes.
 * @returns Thresholds where each numeric entry is `undefined` unless its flag was given.
 * @throws Error when any numeric flag carries a non-numeric value.
 */
function buildThresholds(values: Record<string, string | boolean | undefined>): Phase0BaselineDriftGateThresholds {
  // Reads one numeric flag and reports parse errors under its `--flag` spelling.
  const numericFlag = (name: string): number | undefined =>
    parseOptionalNumber(values[name] as string | undefined, `--${name}`);

  return {
    requireBaselineHistory: Boolean(values['require-baseline-history']),
    minCandidateSampledEvents: numericFlag('min-candidate-sampled-events'),
    minBaselineSampledEvents: numericFlag('min-baseline-sampled-events'),
    maxSampledEventsDropPct: numericFlag('max-sampled-events-drop-pct'),
    maxRunOutcomesDropPct: numericFlag('max-run-outcomes-drop-pct'),
    maxCompletionRateDropPp: numericFlag('max-completion-rate-drop-pp'),
    maxCancelRateIncreasePp: numericFlag('max-cancel-rate-increase-pp'),
    maxErrorRateIncreasePp: numericFlag('max-error-rate-increase-pp'),
    maxCancelLatencyP95IncreaseMs: numericFlag('max-cancel-latency-p95-increase-ms'),
  };
}
|
||||||
|
|
||||||
|
/**
 * Loads every backend-scoped baseline artifact found in a directory.
 *
 * Files whose names do not match ARTIFACT_JSON_PATTERN are ignored. Each matching
 * file is read and parsed; the backend and tag come from the file name, and the
 * timestamp comes from the document's `generated_at` field when it is a parseable string.
 *
 * @param artifactsDir Directory to scan (not recursive).
 * @returns One record per matching artifact file, in directory listing order.
 * @throws Error when a matching file is not valid JSON or does not hold a JSON object;
 *   the message names the offending file so a failed cadence run is actionable.
 */
async function readArtifactRecords(artifactsDir: string): Promise<ArtifactRecord[]> {
  const files = await readdir(artifactsDir);
  const records: ArtifactRecord[] = [];

  for (const file of files) {
    const match = ARTIFACT_JSON_PATTERN.exec(file);
    if (!match) {
      continue;
    }

    const backend = match[1] as Phase0BackendTarget;
    const tag = match[2] ?? '';
    const path = resolve(artifactsDir, file);
    const raw = await readFile(path, 'utf8');

    // Parse defensively: a truncated or hand-edited artifact should fail with the
    // file path in the message rather than a bare SyntaxError.
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to parse artifact JSON "${path}": ${reason}`);
    }

    // A JSON `null`, scalar, or array cannot be an artifact document; reject it here
    // instead of crashing later on a property access.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error(`Artifact "${path}" does not contain a JSON object.`);
    }

    const document = parsed as Phase0BaselineArtifactDocument;
    const generatedAtIso = typeof document.generated_at === 'string' ? document.generated_at : undefined;
    const generatedAtMs = generatedAtIso ? Date.parse(generatedAtIso) : NaN;

    records.push({
      backend,
      tag,
      path,
      generatedAtIso,
      // Unparseable timestamps are stored as undefined rather than NaN.
      generatedAtMs: Number.isFinite(generatedAtMs) ? generatedAtMs : undefined,
      document,
    });
  }

  return records;
}
|
||||||
|
|
||||||
|
/**
 * Chooses the artifact to evaluate.
 *
 * @param records Artifact records to choose from.
 * @param explicitTag When given, the record with exactly this tag is required.
 * @returns The record matching `explicitTag`, or otherwise the first record in
 *   newest-first order.
 * @throws Error when the requested tag is absent or when there are no records.
 */
function pickCandidate(records: ArtifactRecord[], explicitTag?: string): ArtifactRecord {
  if (!explicitTag) {
    const [newest] = sortRecordsDesc(records);
    if (!newest) {
      throw new Error('No candidate artifact found.');
    }
    return newest;
  }

  const tagged = records.find((entry) => entry.tag === explicitTag);
  if (!tagged) {
    throw new Error(`No artifact found for candidate tag "${explicitTag}".`);
  }
  return tagged;
}
|
||||||
|
|
||||||
|
/**
 * Chooses the artifact the candidate is compared against.
 *
 * With an explicit baseline tag, the record carrying that tag is required. Otherwise
 * the baseline is the record that immediately precedes the candidate in newest-first
 * order, i.e. the previous available artifact. This matters when the candidate was
 * selected by tag and is not the newest record: picking simply "the newest record
 * that is not the candidate" would compare it against a later artifact.
 *
 * @param records Artifact records to choose from.
 * @param candidate The record being evaluated.
 * @param explicitBaselineTag When given, the record with exactly this tag is required.
 * @returns The baseline record, or `undefined` when no earlier artifact exists.
 * @throws Error when `explicitBaselineTag` is given but no record carries it.
 */
function pickBaseline(records: ArtifactRecord[], candidate: ArtifactRecord, explicitBaselineTag?: string): ArtifactRecord | undefined {
  if (explicitBaselineTag) {
    const match = records.find((record) => record.tag === explicitBaselineTag);
    if (!match) {
      throw new Error(`No artifact found for baseline tag "${explicitBaselineTag}".`);
    }
    return match;
  }

  const sorted = sortRecordsDesc(records);
  const candidateIndex = sorted.findIndex((record) => record.path === candidate.path);

  // If the candidate is not part of `records`, keep the previous fallback of scanning
  // the whole list; otherwise only consider records ranked after (older than) it.
  const olderRecords = candidateIndex >= 0 ? sorted.slice(candidateIndex + 1) : sorted;
  return olderRecords.find((record) => record.path !== candidate.path);
}
|
||||||
|
|
||||||
|
/**
 * Renders the drift-check report as Markdown text.
 *
 * The report has a header (generation time, artifacts directory, backends,
 * freshness setting, overall verdict), a "Thresholds" section listing every
 * threshold whose value is not `undefined`, and one section per backend result.
 *
 * @param artifactsDir Directory the artifacts were read from (printed verbatim).
 * @param backends Backends that were evaluated.
 * @param thresholds Drift gate thresholds; `undefined` entries are omitted.
 * @param maxAgeHours Freshness limit, or `undefined` when the freshness gate is disabled.
 * @param results Per-backend evaluation results, rendered in the given order.
 * @param overallPass Overall gate verdict.
 * @returns The report lines joined with `\n` (no trailing newline is appended here).
 */
function renderMarkdown(
  artifactsDir: string,
  backends: Phase0BackendTarget[],
  thresholds: Phase0BaselineDriftGateThresholds,
  maxAgeHours: number | undefined,
  results: BackendDriftResult[],
  overallPass: boolean,
): string {
  const verdict = (ok: boolean): string => (ok ? 'PASS' : 'FAIL');

  const lines: string[] = [
    '# Phase-0 Backend Drift Check',
    '',
    // The timestamp is taken at render time, not at evaluation time.
    `Generated at: ${new Date().toISOString()}`,
    `Artifacts: ${artifactsDir}`,
    `Backends: ${backends.join(', ')}`,
    `Freshness max age (hours): ${typeof maxAgeHours === 'number' ? maxAgeHours : 'disabled'}`,
    `Overall gate: ${verdict(overallPass)}`,
    '',
    '## Thresholds',
  ];

  const configuredThresholds = Object.entries(thresholds).filter(([, value]) => value !== undefined);
  if (configuredThresholds.length === 0) {
    lines.push('- none (report-only mode)');
  }
  for (const [key, value] of configuredThresholds) {
    lines.push(`- ${key}: ${String(value)}`);
  }
  lines.push('');

  // Delta rows in report order; the optional second element is the unit suffix
  // handed to formatSignedNumber (omitted for percentage-point and ms deltas).
  type DeltaKey = keyof BackendDriftResult['comparison']['deltas'];
  const deltaRows: ReadonlyArray<readonly [DeltaKey, string?]> = [
    ['sampled_event_count_pct', '%'],
    ['run_total_outcomes_pct', '%'],
    ['completion_rate_pp'],
    ['cancel_rate_pp'],
    ['error_rate_pp'],
    ['cancel_latency_p95_ms'],
    ['reaction_match_rate_pp'],
    ['reaction_skip_rate_pp'],
  ];

  for (const result of results) {
    const { candidate, baseline, comparison, freshness, driftGate } = result;

    lines.push(`## ${result.backend}`);
    lines.push(`- status: ${verdict(result.pass)}`);
    lines.push(`- candidate: tag=${candidate.tag} file=${candidate.path}`);
    lines.push(`- candidate generated_at: ${candidate.generatedAtIso ?? 'n/a'}`);

    if (baseline) {
      lines.push(`- baseline: tag=${baseline.tag} file=${baseline.path}`);
      lines.push(`- baseline generated_at: ${baseline.generatedAtIso ?? 'n/a'}`);
    } else {
      lines.push('- baseline: none');
    }

    lines.push(`- candidate snapshot: ${renderPhase0BaselineDriftSnapshot(comparison.candidate)}`);
    if (comparison.baseline) {
      lines.push(`- baseline snapshot: ${renderPhase0BaselineDriftSnapshot(comparison.baseline)}`);
    }

    lines.push('- deltas:');
    for (const [key, suffix] of deltaRows) {
      lines.push(`  ${key}=${formatSignedNumber(comparison.deltas[key], suffix)}`);
    }

    lines.push(`- freshness gate: ${verdict(freshness.pass)} (age_hours=${formatFreshnessHours(freshness.actual_age_hours)} threshold=${freshness.threshold_hours ?? 'n/a'})`);
    lines.push(`- drift gate: ${verdict(driftGate.pass)}`);

    if (driftGate.criteria.length === 0) {
      lines.push('  criteria: none');
    }
    for (const criterion of driftGate.criteria) {
      lines.push(`  ${verdict(criterion.pass)} ${criterion.criterion} actual=${criterion.actual} threshold=${criterion.threshold}`);
    }

    lines.push('');
  }

  return lines.join('\n');
}
|
||||||
|
|
||||||
|
/**
 * CLI entry point for the backend drift/freshness check.
 *
 * Flow: parse flags, load artifact records, then for each requested backend
 * pick a candidate and a baseline, compare them, evaluate the drift gate and
 * the freshness gate, and finally emit a JSON or Markdown report (to `--out`
 * or stdout). The process exit code is set to 1 when any backend fails.
 *
 * @throws Error when `--max-age-hours` is negative or when a requested backend
 *   has no artifact records; errors from the parsing/loading helpers propagate.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    options: {
      'artifacts-dir': { type: 'string' },
      backend: { type: 'string' },
      tag: { type: 'string' },
      'baseline-tag': { type: 'string' },
      'max-age-hours': { type: 'string' },
      'require-baseline-history': { type: 'boolean' },
      'min-candidate-sampled-events': { type: 'string' },
      'min-baseline-sampled-events': { type: 'string' },
      'max-sampled-events-drop-pct': { type: 'string' },
      'max-run-outcomes-drop-pct': { type: 'string' },
      'max-completion-rate-drop-pp': { type: 'string' },
      'max-cancel-rate-increase-pp': { type: 'string' },
      'max-error-rate-increase-pp': { type: 'string' },
      'max-cancel-latency-p95-increase-ms': { type: 'string' },
      format: { type: 'string' },
      out: { type: 'string' },
      help: { type: 'boolean', short: 'h' },
    },
    strict: true,
    allowPositionals: false,
  });

  if (values.help) {
    process.stdout.write(`${usage()}\n`);
    return;
  }

  const artifactsDir = resolve(values['artifacts-dir'] ?? 'docs/plans/artifacts');
  const backends = parseBackends(values.backend);
  const candidateTag = values.tag;
  const baselineTag = values['baseline-tag'];
  const format = parseFormat(values.format);
  const maxAgeHours = parseOptionalNumber(values['max-age-hours'], '--max-age-hours');
  if (typeof maxAgeHours === 'number' && maxAgeHours < 0) {
    throw new Error('--max-age-hours must be >= 0.');
  }

  const thresholds = buildThresholds(values as Record<string, string | boolean | undefined>);
  const allRecords = await readArtifactRecords(artifactsDir);
  // One reference time for all backends so their ages are measured consistently.
  const nowMs = Date.now();
  const msPerHour = 1000 * 60 * 60;

  // Freshness gate for one candidate: disabled without --max-age-hours, failing
  // when the candidate has no numeric generation timestamp, otherwise age-based.
  const evaluateFreshness = (candidate: ArtifactRecord) => {
    if (typeof maxAgeHours !== 'number') {
      return { enabled: false, pass: true, actual_age_hours: null, threshold_hours: null };
    }
    if (typeof candidate.generatedAtMs !== 'number') {
      return { enabled: true, pass: false, actual_age_hours: null, threshold_hours: maxAgeHours };
    }
    // Timestamps ahead of `nowMs` are clamped to an age of zero.
    const ageHours = Math.max(0, (nowMs - candidate.generatedAtMs) / msPerHour);
    return {
      enabled: true,
      pass: ageHours <= maxAgeHours,
      actual_age_hours: Math.round(ageHours * 100) / 100,
      threshold_hours: maxAgeHours,
    };
  };

  const results: BackendDriftResult[] = [];
  for (const backend of backends) {
    const backendRecords = allRecords.filter((record) => record.backend === backend);
    if (backendRecords.length === 0) {
      throw new Error(`No backend artifact JSON files found for "${backend}" in ${artifactsDir}.`);
    }

    const candidate = pickCandidate(backendRecords, candidateTag);
    const baseline = pickBaseline(backendRecords, candidate, baselineTag);
    const comparison = comparePhase0BaselineDrift(candidate.document, baseline?.document);
    const driftGate = evaluatePhase0BaselineDriftGate(comparison, thresholds);
    const freshness = evaluateFreshness(candidate);

    results.push({
      backend,
      candidate,
      baseline,
      comparison,
      freshness,
      driftGate,
      pass: freshness.pass && driftGate.pass,
    });
  }

  const overallPass = results.every((result) => result.pass);

  let output: string;
  if (format === 'json') {
    output = JSON.stringify({
      generated_at: new Date().toISOString(),
      artifacts_dir: artifactsDir,
      backends,
      candidate_tag: candidateTag,
      baseline_tag: baselineTag,
      max_age_hours: maxAgeHours,
      thresholds,
      overall_pass: overallPass,
      results: results.map((result) => ({
        backend: result.backend,
        pass: result.pass,
        candidate: {
          tag: result.candidate.tag,
          path: result.candidate.path,
          generated_at: result.candidate.generatedAtIso,
        },
        baseline: result.baseline
          ? {
            tag: result.baseline.tag,
            path: result.baseline.path,
            generated_at: result.baseline.generatedAtIso,
          }
          : null,
        comparison: result.comparison,
        freshness: result.freshness,
        drift_gate: result.driftGate,
      })),
    }, null, 2);
  } else {
    output = renderMarkdown(artifactsDir, backends, thresholds, maxAgeHours, results, overallPass);
  }

  if (values.out) {
    await writeOutput(resolve(values.out), output);
  } else {
    process.stdout.write(`${output}\n`);
  }

  // Signal gate failure through the exit code without aborting pending I/O.
  if (!overallPass) {
    process.exitCode = 1;
  }
}
|
||||||
|
|
||||||
|
// Run the CLI. Any rejection is reported on stderr, followed by a blank line
// and the usage text, and the process exit code is set to 1.
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`${message}\n\n${usage()}\n`);
  process.exitCode = 1;
});
|
||||||
@@ -0,0 +1,340 @@
|
|||||||
|
import { describe, expect, it } from 'vitest';
|
||||||
|
import {
|
||||||
|
comparePhase0BaselineDrift,
|
||||||
|
evaluatePhase0BaselineDriftGate,
|
||||||
|
extractPhase0BaselineDriftSnapshot,
|
||||||
|
} from './phase0BaselineDrift.js';
|
||||||
|
|
||||||
|
describe('phase0BaselineDrift', () => {
  // Fixture fragments repeated across the artifacts below. Each call returns a
  // fresh object so no fixture state is shared between artifacts.
  const zeroEventCounts = () => ({ run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 });
  const noReactions = () => ({
    matched: 0,
    skipped: 0,
    total: 0,
    match_rate_pct: null,
    skip_rate_pct: null,
    skip_reasons: [],
  });
  // Latency stats for a single sample: every statistic equals that sample.
  const flatLatency = (ms: number) => ({ count: 1, avg_ms: ms, p50_ms: ms, p95_ms: ms, min_ms: ms, max_ms: ms });

  it('extracts a normalized snapshot from artifact payloads', () => {
    const snapshot = extractPhase0BaselineDriftSnapshot({
      source_event_count: 120,
      sampled_event_count: 60,
      summary: {
        event_counts: zeroEventCounts(),
        run_outcomes: {
          overall: {
            total_outcomes: 25,
            complete: 20,
            cancelled: 3,
            error: 2,
            cancel_requested: 0,
            start: 25,
            completion_rate_pct: 80,
            cancel_rate_pct: 12,
            error_rate_pct: 8,
          },
          by_channel: [],
          by_session: [],
        },
        cancel_latency_ms: { count: 2, avg_ms: 120, p50_ms: 100, p95_ms: 180, min_ms: 80, max_ms: 220 },
        reactions: {
          matched: 10,
          skipped: 5,
          total: 15,
          match_rate_pct: 66.67,
          skip_rate_pct: 33.33,
          skip_reasons: [],
        },
      },
    });

    expect(snapshot).toEqual({
      source_event_count: 120,
      sampled_event_count: 60,
      run_total_outcomes: 25,
      completion_rate_pct: 80,
      cancel_rate_pct: 12,
      error_rate_pct: 8,
      cancel_latency_p95_ms: 180,
      reaction_match_rate_pct: 66.67,
      reaction_skip_rate_pct: 33.33,
    });
  });

  it('computes deltas between baseline and candidate artifacts', () => {
    const comparison = comparePhase0BaselineDrift(
      // Candidate
      {
        sampled_event_count: 45,
        summary: {
          event_counts: zeroEventCounts(),
          run_outcomes: {
            overall: {
              total_outcomes: 18,
              complete: 15,
              cancelled: 2,
              error: 1,
              cancel_requested: 0,
              start: 18,
              completion_rate_pct: 83.33,
              cancel_rate_pct: 11.11,
              error_rate_pct: 5.56,
            },
            by_channel: [],
            by_session: [],
          },
          cancel_latency_ms: flatLatency(50),
          reactions: {
            matched: 3,
            skipped: 1,
            total: 4,
            match_rate_pct: 75,
            skip_rate_pct: 25,
            skip_reasons: [],
          },
        },
      },
      // Baseline
      {
        sampled_event_count: 60,
        summary: {
          event_counts: zeroEventCounts(),
          run_outcomes: {
            overall: {
              total_outcomes: 20,
              complete: 18,
              cancelled: 1,
              error: 1,
              cancel_requested: 0,
              start: 20,
              completion_rate_pct: 90,
              cancel_rate_pct: 5,
              error_rate_pct: 5,
            },
            by_channel: [],
            by_session: [],
          },
          cancel_latency_ms: flatLatency(80),
          reactions: {
            matched: 7,
            skipped: 3,
            total: 10,
            match_rate_pct: 70,
            skip_rate_pct: 30,
            skip_reasons: [],
          },
        },
      },
    );

    expect(comparison.deltas).toEqual({
      sampled_event_count_pct: -25,
      run_total_outcomes_pct: -10,
      completion_rate_pp: -6.67,
      cancel_rate_pp: 6.11,
      error_rate_pp: 0.56,
      cancel_latency_p95_ms: -30,
      reaction_match_rate_pp: 5,
      reaction_skip_rate_pp: -5,
    });
  });

  it('evaluates drift thresholds against candidate deltas', () => {
    const comparison = comparePhase0BaselineDrift(
      // Candidate
      {
        sampled_event_count: 80,
        summary: {
          event_counts: zeroEventCounts(),
          run_outcomes: {
            overall: {
              total_outcomes: 30,
              complete: 24,
              cancelled: 3,
              error: 3,
              cancel_requested: 0,
              start: 30,
              completion_rate_pct: 80,
              cancel_rate_pct: 10,
              error_rate_pct: 10,
            },
            by_channel: [],
            by_session: [],
          },
          cancel_latency_ms: flatLatency(200),
          reactions: noReactions(),
        },
      },
      // Baseline
      {
        sampled_event_count: 100,
        summary: {
          event_counts: zeroEventCounts(),
          run_outcomes: {
            overall: {
              total_outcomes: 40,
              complete: 38,
              cancelled: 1,
              error: 1,
              cancel_requested: 0,
              start: 40,
              completion_rate_pct: 95,
              cancel_rate_pct: 2.5,
              error_rate_pct: 2.5,
            },
            by_channel: [],
            by_session: [],
          },
          cancel_latency_ms: flatLatency(120),
          reactions: noReactions(),
        },
      },
    );

    const passResult = evaluatePhase0BaselineDriftGate(comparison, {
      minCandidateSampledEvents: 50,
      minBaselineSampledEvents: 50,
      maxSampledEventsDropPct: 30,
      maxRunOutcomesDropPct: 30,
      maxCompletionRateDropPp: 20,
      maxCancelRateIncreasePp: 12,
      maxErrorRateIncreasePp: 12,
      maxCancelLatencyP95IncreaseMs: 100,
    });
    expect(passResult.pass).toBe(true);
    expect(passResult.criteria.every((row) => row.pass)).toBe(true);

    const failResult = evaluatePhase0BaselineDriftGate(comparison, {
      maxCompletionRateDropPp: 10,
      maxCancelRateIncreasePp: 5,
      maxErrorRateIncreasePp: 5,
      maxCancelLatencyP95IncreaseMs: 50,
    });
    expect(failResult.pass).toBe(false);
    expect(failResult.criteria.filter((row) => !row.pass).map((row) => row.criterion)).toEqual([
      'completion_rate_drop_pp',
      'cancel_rate_increase_pp',
      'error_rate_increase_pp',
      'cancel_latency_p95_increase_ms',
    ]);
  });

  it('supports missing baseline history and optional strict requirement', () => {
    // No second argument: the comparison has no baseline.
    const noBaseline = comparePhase0BaselineDrift({
      sampled_event_count: 12,
      summary: {
        event_counts: zeroEventCounts(),
        run_outcomes: {
          overall: {
            total_outcomes: 3,
            complete: 3,
            cancelled: 0,
            error: 0,
            cancel_requested: 0,
            start: 3,
            completion_rate_pct: 100,
            cancel_rate_pct: 0,
            error_rate_pct: 0,
          },
          by_channel: [],
          by_session: [],
        },
        cancel_latency_ms: null,
        reactions: noReactions(),
      },
    });

    const relaxed = evaluatePhase0BaselineDriftGate(noBaseline, {
      maxSampledEventsDropPct: 25,
    });
    expect(relaxed.pass).toBe(true);
    expect(relaxed.criteria[0]).toMatchObject({
      criterion: 'sampled_events_drop_pct',
      pass: true,
      actual: 'n/a',
    });

    const strict = evaluatePhase0BaselineDriftGate(noBaseline, {
      requireBaselineHistory: true,
      maxSampledEventsDropPct: 25,
    });
    expect(strict.pass).toBe(false);
    expect(strict.criteria.filter((row) => !row.pass).map((row) => row.criterion)).toEqual([
      'baseline_history',
      'sampled_events_drop_pct',
    ]);
  });
});
|
||||||
@@ -0,0 +1,336 @@
|
|||||||
|
import type { Phase0BaselineSummary } from './phase0BaselineSummary.js';
|
||||||
|
|
||||||
|
/**
 * The fields of a phase-0 baseline artifact document that the drift comparison
 * reads. Every field is optional; missing values are defaulted when a snapshot
 * is extracted.
 */
export interface Phase0BaselineArtifactDocument {
  /** Generation timestamp string; not read by the comparison functions in this module. */
  generated_at?: string;
  source_event_count?: number;
  sampled_event_count?: number;
  /** Aggregated summary from which run-outcome, cancel-latency and reaction metrics are read. */
  summary?: Phase0BaselineSummary;
}
|
||||||
|
|
||||||
|
/**
 * Flat, normalized view of one artifact as produced by
 * `extractPhase0BaselineDriftSnapshot`: counts fall back to `0`, while rates and
 * latency are `null` when the artifact has no finite value for them.
 */
export interface Phase0BaselineDriftSnapshot {
  source_event_count: number;
  sampled_event_count: number;
  run_total_outcomes: number;
  completion_rate_pct: number | null;
  cancel_rate_pct: number | null;
  error_rate_pct: number | null;
  cancel_latency_p95_ms: number | null;
  reaction_match_rate_pct: number | null;
  reaction_skip_rate_pct: number | null;
}
|
||||||
|
|
||||||
|
/**
 * Candidate-minus-baseline differences, as computed by
 * `comparePhase0BaselineDrift`. A field is `null` when it cannot be computed
 * (no baseline, or a side of the comparison has no usable value).
 *
 * - `*_pct`: relative change versus the baseline value, in percent.
 * - `*_pp`: difference between two percentage rates.
 * - `cancel_latency_p95_ms`: difference between the two p95 latencies.
 */
export interface Phase0BaselineDriftDeltas {
  sampled_event_count_pct: number | null;
  run_total_outcomes_pct: number | null;
  completion_rate_pp: number | null;
  cancel_rate_pp: number | null;
  error_rate_pp: number | null;
  cancel_latency_p95_ms: number | null;
  reaction_match_rate_pp: number | null;
  reaction_skip_rate_pp: number | null;
}
|
||||||
|
|
||||||
|
/** Result of comparing a candidate artifact against an optional baseline artifact. */
export interface Phase0BaselineDriftComparison {
  /** Baseline snapshot, or `null` when no baseline artifact was supplied. */
  baseline: Phase0BaselineDriftSnapshot | null;
  candidate: Phase0BaselineDriftSnapshot;
  deltas: Phase0BaselineDriftDeltas;
}
|
||||||
|
|
||||||
|
/**
 * Optional limits for `evaluatePhase0BaselineDriftGate`. A numeric threshold
 * only produces a gate criterion when it is a finite number; with no thresholds
 * set the gate has no criteria and passes.
 */
export interface Phase0BaselineDriftGateThresholds {
  /**
   * When truthy, a `baseline_history` criterion is added (failing without a
   * baseline) and criteria whose metric is unavailable fail instead of pass.
   */
  requireBaselineHistory?: boolean;
  /** Minimum `sampled_event_count` of the candidate. */
  minCandidateSampledEvents?: number;
  /** Minimum `sampled_event_count` of the baseline. */
  minBaselineSampledEvents?: number;
  /** Largest tolerated drop of the sampled event count, in percent. */
  maxSampledEventsDropPct?: number;
  /** Largest tolerated drop of total run outcomes, in percent. */
  maxRunOutcomesDropPct?: number;
  /** Largest tolerated drop of the completion rate, in percentage points. */
  maxCompletionRateDropPp?: number;
  /** Largest tolerated increase of the cancel rate, in percentage points. */
  maxCancelRateIncreasePp?: number;
  /** Largest tolerated increase of the error rate, in percentage points. */
  maxErrorRateIncreasePp?: number;
  /** Largest tolerated increase of the p95 cancel latency, in milliseconds. */
  maxCancelLatencyP95IncreaseMs?: number;
}
|
||||||
|
|
||||||
|
/** One evaluated gate criterion, with display-ready actual and threshold strings. */
export interface Phase0BaselineDriftGateCriterion {
  /** Criterion identifier, e.g. `sampled_events_drop_pct`. */
  criterion: string;
  pass: boolean;
  /** Observed value as text; `'n/a'` when the metric could not be computed. */
  actual: string;
  /** Threshold as text, e.g. `<= 25`, `>= 50` or `required`. */
  threshold: string;
}
|
||||||
|
|
||||||
|
/** Outcome of the drift gate: the overall verdict plus every evaluated criterion. */
export interface Phase0BaselineDriftGateResult {
  /** `true` when every criterion passed (also `true` when there are no criteria). */
  pass: boolean;
  criteria: Phase0BaselineDriftGateCriterion[];
}
|
||||||
|
|
||||||
|
/**
 * Returns `value` when it is a finite number, otherwise `undefined`.
 * Non-numbers, `NaN` and `±Infinity` are all rejected.
 */
function readFiniteNumber(value: unknown): number | undefined {
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
|
||||||
|
|
||||||
|
/** Like `readFiniteNumber`, but yields `null` instead of `undefined` for unusable values. */
function readFiniteNumberOrNull(value: unknown): number | null {
  return readFiniteNumber(value) ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Reads a configured threshold: a finite number is returned as-is, anything
 * else yields `undefined`, meaning "threshold not configured".
 */
function readThreshold(value: unknown): number | undefined {
  return readFiniteNumber(value);
}
|
||||||
|
|
||||||
|
/**
 * Relative change from `baseline` to `candidate` in percent, rounded to two
 * decimals.
 *
 * @returns `null` when the baseline is not a finite number greater than zero
 *   (a relative change is undefined there) or the candidate is not finite.
 */
function toPctDelta(baseline: number, candidate: number): number | null {
  const baselineUsable = Number.isFinite(baseline) && baseline > 0;
  if (!baselineUsable || !Number.isFinite(candidate)) {
    return null;
  }
  const pct = ((candidate - baseline) / baseline) * 100;
  return Math.round(pct * 100) / 100;
}
|
||||||
|
|
||||||
|
/**
 * Absolute difference `candidate - baseline`, rounded to two decimals.
 *
 * @returns `null` when either side is `null`.
 */
function toRateDeltaPp(baseline: number | null, candidate: number | null): number | null {
  if (baseline !== null && candidate !== null) {
    return Math.round((candidate - baseline) * 100) / 100;
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Formats a metric for display: the number followed by `suffix`, or `'n/a'`
 * when the value is `null` or not finite.
 */
function formatNumber(value: number | null, suffix = ''): string {
  const printable = value !== null && Number.isFinite(value);
  return printable ? `${value}${suffix}` : 'n/a';
}
|
||||||
|
|
||||||
|
/**
 * Flattens an artifact document into the metrics used for drift comparison.
 *
 * Missing or non-finite counts become `0`; missing or non-finite rates and the
 * p95 cancel latency become `null`. A document without `summary` is accepted.
 *
 * @param artifact Artifact document to read.
 * @returns The normalized snapshot.
 */
export function extractPhase0BaselineDriftSnapshot(
  artifact: Phase0BaselineArtifactDocument,
): Phase0BaselineDriftSnapshot {
  const { summary } = artifact;
  const overall = summary?.run_outcomes?.overall;
  const latency = summary?.cancel_latency_ms;
  const reactionStats = summary?.reactions;

  const countOrZero = (value: unknown): number => readFiniteNumber(value) ?? 0;

  return {
    source_event_count: countOrZero(artifact.source_event_count),
    sampled_event_count: countOrZero(artifact.sampled_event_count),
    run_total_outcomes: countOrZero(overall?.total_outcomes),
    completion_rate_pct: readFiniteNumberOrNull(overall?.completion_rate_pct),
    cancel_rate_pct: readFiniteNumberOrNull(overall?.cancel_rate_pct),
    error_rate_pct: readFiniteNumberOrNull(overall?.error_rate_pct),
    cancel_latency_p95_ms: readFiniteNumberOrNull(latency?.p95_ms),
    reaction_match_rate_pct: readFiniteNumberOrNull(reactionStats?.match_rate_pct),
    reaction_skip_rate_pct: readFiniteNumberOrNull(reactionStats?.skip_rate_pct),
  };
}
|
||||||
|
|
||||||
|
/**
 * Compares a candidate artifact with an optional baseline artifact.
 *
 * @param candidateArtifact Artifact under evaluation.
 * @param baselineArtifact Artifact to compare against; when omitted or `null`,
 *   `baseline` is `null` and every delta is `null`.
 * @returns Both snapshots plus the candidate-minus-baseline deltas.
 */
export function comparePhase0BaselineDrift(
  candidateArtifact: Phase0BaselineArtifactDocument,
  baselineArtifact?: Phase0BaselineArtifactDocument | null,
): Phase0BaselineDriftComparison {
  const candidate = extractPhase0BaselineDriftSnapshot(candidateArtifact);
  const baseline = baselineArtifact ? extractPhase0BaselineDriftSnapshot(baselineArtifact) : null;

  // Relative change of a count; only defined when a baseline exists.
  const countDelta = (pick: (snapshot: Phase0BaselineDriftSnapshot) => number): number | null =>
    (baseline ? toPctDelta(pick(baseline), pick(candidate)) : null);

  // Absolute difference of a nullable metric; a missing baseline counts as `null`.
  const absoluteDelta = (pick: (snapshot: Phase0BaselineDriftSnapshot) => number | null): number | null =>
    toRateDeltaPp(baseline ? pick(baseline) : null, pick(candidate));

  return {
    baseline,
    candidate,
    deltas: {
      sampled_event_count_pct: countDelta((snapshot) => snapshot.sampled_event_count),
      run_total_outcomes_pct: countDelta((snapshot) => snapshot.run_total_outcomes),
      completion_rate_pp: absoluteDelta((snapshot) => snapshot.completion_rate_pct),
      cancel_rate_pp: absoluteDelta((snapshot) => snapshot.cancel_rate_pct),
      error_rate_pp: absoluteDelta((snapshot) => snapshot.error_rate_pct),
      // Latency uses the same absolute-difference helper; the unit here is ms, not pp.
      cancel_latency_p95_ms: absoluteDelta((snapshot) => snapshot.cancel_latency_p95_ms),
      reaction_match_rate_pp: absoluteDelta((snapshot) => snapshot.reaction_match_rate_pct),
      reaction_skip_rate_pp: absoluteDelta((snapshot) => snapshot.reaction_skip_rate_pct),
    },
  };
}
|
||||||
|
|
||||||
|
/**
 * Builds one "movement must not exceed limit" criterion.
 *
 * Only movement in the unwanted direction counts: for `'drop'` a negative delta
 * is measured as a positive drop, for `'increase'` a positive delta is measured
 * as-is; movement the other way is treated as zero.
 */
function buildDirectionalCriterion(
  criterion: string,
  delta: number | null,
  limit: number,
  direction: 'drop' | 'increase',
  passWhenUnavailable: boolean,
): Phase0BaselineDriftGateCriterion {
  const threshold = `<= ${limit}`;
  if (delta === null) {
    return { criterion, pass: passWhenUnavailable, actual: 'n/a', threshold };
  }
  const magnitude = Math.max(0, direction === 'drop' ? -delta : delta);
  return {
    criterion,
    pass: magnitude <= limit,
    actual: `${Math.round(magnitude * 100) / 100}`,
    threshold,
  };
}

/**
 * Evaluates a comparison against the configured thresholds.
 *
 * Criteria are emitted in a fixed order: `baseline_history` (only when
 * required), `candidate_sampled_events`, `baseline_sampled_events`, then the
 * drop/increase limits. A threshold that is not a finite number produces no
 * criterion. When a metric is unavailable (no baseline, or a `null` delta) its
 * criterion reports `actual: 'n/a'` and passes unless
 * `thresholds.requireBaselineHistory` is truthy.
 *
 * @param comparison Output of `comparePhase0BaselineDrift`.
 * @param thresholds Limits to enforce.
 * @returns The overall verdict (`true` when every criterion passes, including
 *   when there are none) and the list of evaluated criteria.
 */
export function evaluatePhase0BaselineDriftGate(
  comparison: Phase0BaselineDriftComparison,
  thresholds: Phase0BaselineDriftGateThresholds,
): Phase0BaselineDriftGateResult {
  const criteria: Phase0BaselineDriftGateCriterion[] = [];
  const requireBaselineHistory = Boolean(thresholds.requireBaselineHistory);
  const { baseline, candidate, deltas } = comparison;

  if (requireBaselineHistory) {
    criteria.push({
      criterion: 'baseline_history',
      pass: baseline !== null,
      actual: baseline ? 'present' : 'missing',
      threshold: 'required',
    });
  }

  const minCandidateSampledEvents = readThreshold(thresholds.minCandidateSampledEvents);
  if (typeof minCandidateSampledEvents === 'number') {
    criteria.push({
      criterion: 'candidate_sampled_events',
      pass: candidate.sampled_event_count >= minCandidateSampledEvents,
      actual: String(candidate.sampled_event_count),
      threshold: `>= ${minCandidateSampledEvents}`,
    });
  }

  const minBaselineSampledEvents = readThreshold(thresholds.minBaselineSampledEvents);
  if (typeof minBaselineSampledEvents === 'number') {
    criteria.push({
      criterion: 'baseline_sampled_events',
      pass: baseline
        ? baseline.sampled_event_count >= minBaselineSampledEvents
        : !requireBaselineHistory,
      actual: baseline ? String(baseline.sampled_event_count) : 'n/a',
      threshold: `>= ${minBaselineSampledEvents}`,
    });
  }

  // Directional limits, listed in the order their criteria must appear.
  const directionalRules: ReadonlyArray<{
    criterion: string;
    limit: number | undefined;
    delta: number | null;
    direction: 'drop' | 'increase';
  }> = [
    { criterion: 'sampled_events_drop_pct', limit: thresholds.maxSampledEventsDropPct, delta: deltas.sampled_event_count_pct, direction: 'drop' },
    { criterion: 'run_outcomes_drop_pct', limit: thresholds.maxRunOutcomesDropPct, delta: deltas.run_total_outcomes_pct, direction: 'drop' },
    { criterion: 'completion_rate_drop_pp', limit: thresholds.maxCompletionRateDropPp, delta: deltas.completion_rate_pp, direction: 'drop' },
    { criterion: 'cancel_rate_increase_pp', limit: thresholds.maxCancelRateIncreasePp, delta: deltas.cancel_rate_pp, direction: 'increase' },
    { criterion: 'error_rate_increase_pp', limit: thresholds.maxErrorRateIncreasePp, delta: deltas.error_rate_pp, direction: 'increase' },
    { criterion: 'cancel_latency_p95_increase_ms', limit: thresholds.maxCancelLatencyP95IncreaseMs, delta: deltas.cancel_latency_p95_ms, direction: 'increase' },
  ];

  for (const rule of directionalRules) {
    const limit = readThreshold(rule.limit);
    if (typeof limit === 'number') {
      criteria.push(
        buildDirectionalCriterion(rule.criterion, rule.delta, limit, rule.direction, !requireBaselineHistory),
      );
    }
  }

  return {
    pass: criteria.every((row) => row.pass),
    criteria,
  };
}
|
||||||
|
|
||||||
|
/**
 * Renders a snapshot as a single line of space-separated `key=value` pairs:
 * sampled events, total outcomes, completion/cancel/error rates (with `%`) and
 * the p95 cancel latency. Unavailable metrics are shown as `n/a`; the reaction
 * rates are not included.
 */
export function renderPhase0BaselineDriftSnapshot(snapshot: Phase0BaselineDriftSnapshot): string {
  const parts: string[] = [
    `sampled=${snapshot.sampled_event_count}`,
    `outcomes=${snapshot.run_total_outcomes}`,
    `completion=${formatNumber(snapshot.completion_rate_pct, '%')}`,
    `cancel=${formatNumber(snapshot.cancel_rate_pct, '%')}`,
    `error=${formatNumber(snapshot.error_rate_pct, '%')}`,
    `cancel_p95_ms=${formatNumber(snapshot.cancel_latency_p95_ms)}`,
  ];
  return parts.join(' ');
}
|
||||||
Reference in New Issue
Block a user