feat(audit): add phase0 backend drift and freshness gates

This commit is contained in:
William Valentin
2026-02-27 09:01:43 -08:00
parent 68cdc2cf8b
commit 20224f1601
10 changed files with 1157 additions and 7 deletions
+340
View File
@@ -0,0 +1,340 @@
import { describe, expect, it } from 'vitest';
import {
comparePhase0BaselineDrift,
evaluatePhase0BaselineDriftGate,
extractPhase0BaselineDriftSnapshot,
} from './phase0BaselineDrift.js';
describe('phase0BaselineDrift', () => {
it('extracts a normalized snapshot from artifact payloads', () => {
  // Fully populated artifact; only a handful of these fields should survive
  // into the flattened snapshot asserted below.
  const artifact = {
    source_event_count: 120,
    sampled_event_count: 60,
    summary: {
      event_counts: { run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 },
      run_outcomes: {
        overall: {
          total_outcomes: 25,
          complete: 20,
          cancelled: 3,
          error: 2,
          cancel_requested: 0,
          start: 25,
          completion_rate_pct: 80,
          cancel_rate_pct: 12,
          error_rate_pct: 8,
        },
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: { count: 2, avg_ms: 120, p50_ms: 100, p95_ms: 180, min_ms: 80, max_ms: 220 },
      reactions: {
        matched: 10,
        skipped: 5,
        total: 15,
        match_rate_pct: 66.67,
        skip_rate_pct: 33.33,
        skip_reasons: [],
      },
    },
  };

  const expectedSnapshot = {
    source_event_count: 120,
    sampled_event_count: 60,
    run_total_outcomes: 25,
    completion_rate_pct: 80,
    cancel_rate_pct: 12,
    error_rate_pct: 8,
    cancel_latency_p95_ms: 180,
    reaction_match_rate_pct: 66.67,
    reaction_skip_rate_pct: 33.33,
  };

  expect(extractPhase0BaselineDriftSnapshot(artifact)).toEqual(expectedSnapshot);
});
it('computes deltas between baseline and candidate artifacts', () => {
  // The candidate is passed as the FIRST argument, the baseline as the second.
  const candidateArtifact = {
    sampled_event_count: 45,
    summary: {
      event_counts: { run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 },
      run_outcomes: {
        overall: {
          total_outcomes: 18,
          complete: 15,
          cancelled: 2,
          error: 1,
          cancel_requested: 0,
          start: 18,
          completion_rate_pct: 83.33,
          cancel_rate_pct: 11.11,
          error_rate_pct: 5.56,
        },
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: { count: 1, avg_ms: 50, p50_ms: 50, p95_ms: 50, min_ms: 50, max_ms: 50 },
      reactions: {
        matched: 3,
        skipped: 1,
        total: 4,
        match_rate_pct: 75,
        skip_rate_pct: 25,
        skip_reasons: [],
      },
    },
  };

  const baselineArtifact = {
    sampled_event_count: 60,
    summary: {
      event_counts: { run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 },
      run_outcomes: {
        overall: {
          total_outcomes: 20,
          complete: 18,
          cancelled: 1,
          error: 1,
          cancel_requested: 0,
          start: 20,
          completion_rate_pct: 90,
          cancel_rate_pct: 5,
          error_rate_pct: 5,
        },
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: { count: 1, avg_ms: 80, p50_ms: 80, p95_ms: 80, min_ms: 80, max_ms: 80 },
      reactions: {
        matched: 7,
        skipped: 3,
        total: 10,
        match_rate_pct: 70,
        skip_rate_pct: 30,
        skip_reasons: [],
      },
    },
  };

  const { deltas } = comparePhase0BaselineDrift(candidateArtifact, baselineArtifact);

  // Every delta is "candidate minus baseline": counts as a relative percentage,
  // rates and latency as a plain difference, all rounded to two decimals.
  expect(deltas).toEqual({
    sampled_event_count_pct: -25,
    run_total_outcomes_pct: -10,
    completion_rate_pp: -6.67,
    cancel_rate_pp: 6.11,
    error_rate_pp: 0.56,
    cancel_latency_p95_ms: -30,
    reaction_match_rate_pp: 5,
    reaction_skip_rate_pp: -5,
  });
});
it('evaluates drift thresholds against candidate deltas', () => {
  // Candidate (first argument) is worse than the baseline on every gated metric:
  // fewer samples and outcomes, lower completion, higher cancel/error rates and latency.
  const candidateArtifact = {
    sampled_event_count: 80,
    summary: {
      event_counts: { run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 },
      run_outcomes: {
        overall: {
          total_outcomes: 30,
          complete: 24,
          cancelled: 3,
          error: 3,
          cancel_requested: 0,
          start: 30,
          completion_rate_pct: 80,
          cancel_rate_pct: 10,
          error_rate_pct: 10,
        },
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: { count: 1, avg_ms: 200, p50_ms: 200, p95_ms: 200, min_ms: 200, max_ms: 200 },
      reactions: {
        matched: 0,
        skipped: 0,
        total: 0,
        match_rate_pct: null,
        skip_rate_pct: null,
        skip_reasons: [],
      },
    },
  };

  const baselineArtifact = {
    sampled_event_count: 100,
    summary: {
      event_counts: { run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 },
      run_outcomes: {
        overall: {
          total_outcomes: 40,
          complete: 38,
          cancelled: 1,
          error: 1,
          cancel_requested: 0,
          start: 40,
          completion_rate_pct: 95,
          cancel_rate_pct: 2.5,
          error_rate_pct: 2.5,
        },
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: { count: 1, avg_ms: 120, p50_ms: 120, p95_ms: 120, min_ms: 120, max_ms: 120 },
      reactions: {
        matched: 0,
        skipped: 0,
        total: 0,
        match_rate_pct: null,
        skip_rate_pct: null,
        skip_reasons: [],
      },
    },
  };

  const comparison = comparePhase0BaselineDrift(candidateArtifact, baselineArtifact);

  // Generous limits: every configured criterion should be satisfied.
  const lenientThresholds = {
    minCandidateSampledEvents: 50,
    minBaselineSampledEvents: 50,
    maxSampledEventsDropPct: 30,
    maxRunOutcomesDropPct: 30,
    maxCompletionRateDropPp: 20,
    maxCancelRateIncreasePp: 12,
    maxErrorRateIncreasePp: 12,
    maxCancelLatencyP95IncreaseMs: 100,
  };
  const passResult = evaluatePhase0BaselineDriftGate(comparison, lenientThresholds);
  expect(passResult.pass).toBe(true);
  expect(passResult.criteria.every((row) => row.pass)).toBe(true);

  // Tight limits: each of the four configured criteria should trip, in evaluation order.
  const tightThresholds = {
    maxCompletionRateDropPp: 10,
    maxCancelRateIncreasePp: 5,
    maxErrorRateIncreasePp: 5,
    maxCancelLatencyP95IncreaseMs: 50,
  };
  const failResult = evaluatePhase0BaselineDriftGate(comparison, tightThresholds);
  expect(failResult.pass).toBe(false);

  const failedCriteria = failResult.criteria.filter((row) => !row.pass).map((row) => row.criterion);
  expect(failedCriteria).toEqual([
    'completion_rate_drop_pp',
    'cancel_rate_increase_pp',
    'error_rate_increase_pp',
    'cancel_latency_p95_increase_ms',
  ]);
});
it('supports missing baseline history and optional strict requirement', () => {
  // Only a candidate artifact is supplied, so the comparison has no baseline.
  const candidateOnlyArtifact = {
    sampled_event_count: 12,
    summary: {
      event_counts: { run_state: 0, run_cancel: 0, reaction_match: 0, reaction_skip: 0 },
      run_outcomes: {
        overall: {
          total_outcomes: 3,
          complete: 3,
          cancelled: 0,
          error: 0,
          cancel_requested: 0,
          start: 3,
          completion_rate_pct: 100,
          cancel_rate_pct: 0,
          error_rate_pct: 0,
        },
        by_channel: [],
        by_session: [],
      },
      cancel_latency_ms: null,
      reactions: {
        matched: 0,
        skipped: 0,
        total: 0,
        match_rate_pct: null,
        skip_rate_pct: null,
        skip_reasons: [],
      },
    },
  };
  const noBaseline = comparePhase0BaselineDrift(candidateOnlyArtifact);

  // Without the strict flag, a delta that cannot be computed is reported as
  // 'n/a' and does not fail the gate.
  const relaxed = evaluatePhase0BaselineDriftGate(noBaseline, {
    maxSampledEventsDropPct: 25,
  });
  expect(relaxed.pass).toBe(true);
  expect(relaxed.criteria[0]).toMatchObject({
    criterion: 'sampled_events_drop_pct',
    pass: true,
    actual: 'n/a',
  });

  // With the strict flag, both the explicit history requirement and the
  // uncomputable delta are reported as failures.
  const strict = evaluatePhase0BaselineDriftGate(noBaseline, {
    requireBaselineHistory: true,
    maxSampledEventsDropPct: 25,
  });
  expect(strict.pass).toBe(false);

  const failedCriteria = strict.criteria.filter((row) => !row.pass).map((row) => row.criterion);
  expect(failedCriteria).toEqual([
    'baseline_history',
    'sampled_events_drop_pct',
  ]);
});
});
+336
View File
@@ -0,0 +1,336 @@
import type { Phase0BaselineSummary } from './phase0BaselineSummary.js';
/**
 * Baseline artifact payload accepted by the drift helpers in this module.
 *
 * Every field is optional; `extractPhase0BaselineDriftSnapshot` tolerates
 * missing or non-finite values instead of throwing.
 */
export interface Phase0BaselineArtifactDocument {
/** Not read by the drift helpers in this file. NOTE(review): presumably the artifact creation timestamp — confirm against the producer. */
generated_at?: string;
/** Copied into the snapshot; treated as 0 when missing or not a finite number. */
source_event_count?: number;
/** Copied into the snapshot; treated as 0 when missing or not a finite number. */
sampled_event_count?: number;
/** Aggregated summary from which run outcome, cancel latency and reaction rates are read. */
summary?: Phase0BaselineSummary;
}
/**
 * Flat set of metrics extracted from one artifact for drift comparison.
 *
 * Count fields fall back to 0 and rate/latency fields to `null` when the
 * artifact value is missing or not a finite number.
 */
export interface Phase0BaselineDriftSnapshot {
/** From the artifact's top-level `source_event_count`. */
source_event_count: number;
/** From the artifact's top-level `sampled_event_count`. */
sampled_event_count: number;
/** From `summary.run_outcomes.overall.total_outcomes`. */
run_total_outcomes: number;
/** From `summary.run_outcomes.overall.completion_rate_pct`. */
completion_rate_pct: number | null;
/** From `summary.run_outcomes.overall.cancel_rate_pct`. */
cancel_rate_pct: number | null;
/** From `summary.run_outcomes.overall.error_rate_pct`. */
error_rate_pct: number | null;
/** From `summary.cancel_latency_ms.p95_ms`. */
cancel_latency_p95_ms: number | null;
/** From `summary.reactions.match_rate_pct`. */
reaction_match_rate_pct: number | null;
/** From `summary.reactions.skip_rate_pct`. */
reaction_skip_rate_pct: number | null;
}
/**
 * Candidate-minus-baseline differences produced by `comparePhase0BaselineDrift`.
 *
 * All values are rounded to two decimals. A field is `null` when it cannot be
 * computed: there is no baseline, one side of a rate/latency is `null`, or
 * (for the `_pct` fields) the baseline count is not greater than zero.
 */
export interface Phase0BaselineDriftDeltas {
/** Relative change of `sampled_event_count`: (candidate - baseline) / baseline * 100. */
sampled_event_count_pct: number | null;
/** Relative change of `run_total_outcomes`: (candidate - baseline) / baseline * 100. */
run_total_outcomes_pct: number | null;
/** Plain difference of `completion_rate_pct` (candidate - baseline). */
completion_rate_pp: number | null;
/** Plain difference of `cancel_rate_pct` (candidate - baseline). */
cancel_rate_pp: number | null;
/** Plain difference of `error_rate_pct` (candidate - baseline). */
error_rate_pp: number | null;
/** Plain difference of `cancel_latency_p95_ms` (candidate - baseline). */
cancel_latency_p95_ms: number | null;
/** Plain difference of `reaction_match_rate_pct` (candidate - baseline). */
reaction_match_rate_pp: number | null;
/** Plain difference of `reaction_skip_rate_pct` (candidate - baseline). */
reaction_skip_rate_pp: number | null;
}
/** Result of comparing a candidate artifact against an optional baseline artifact. */
export interface Phase0BaselineDriftComparison {
/** Snapshot of the baseline artifact, or `null` when no baseline artifact was supplied. */
baseline: Phase0BaselineDriftSnapshot | null;
/** Snapshot of the candidate artifact; always present. */
candidate: Phase0BaselineDriftSnapshot;
/** Candidate-minus-baseline differences; all `null` when there is no baseline. */
deltas: Phase0BaselineDriftDeltas;
}
/**
 * Limits consumed by `evaluatePhase0BaselineDriftGate`.
 *
 * Every field is optional. A numeric limit only produces a criterion when it
 * is a finite number; otherwise that check is skipped entirely.
 */
export interface Phase0BaselineDriftGateThresholds {
/**
 * When truthy, adds a `baseline_history` criterion that fails if the baseline is missing,
 * and makes any criterion whose value cannot be computed ('n/a') fail instead of pass.
 */
requireBaselineHistory?: boolean;
/** Lower bound (inclusive) on the candidate's `sampled_event_count`. */
minCandidateSampledEvents?: number;
/** Lower bound (inclusive) on the baseline's `sampled_event_count`. */
minBaselineSampledEvents?: number;
/** Upper bound (inclusive) on the drop in `sampled_event_count_pct`; increases count as a drop of 0. */
maxSampledEventsDropPct?: number;
/** Upper bound (inclusive) on the drop in `run_total_outcomes_pct`; increases count as a drop of 0. */
maxRunOutcomesDropPct?: number;
/** Upper bound (inclusive) on the drop in `completion_rate_pp`; increases count as a drop of 0. */
maxCompletionRateDropPp?: number;
/** Upper bound (inclusive) on the increase in `cancel_rate_pp`; decreases count as an increase of 0. */
maxCancelRateIncreasePp?: number;
/** Upper bound (inclusive) on the increase in `error_rate_pp`; decreases count as an increase of 0. */
maxErrorRateIncreasePp?: number;
/** Upper bound (inclusive) on the increase in the `cancel_latency_p95_ms` delta; decreases count as an increase of 0. */
maxCancelLatencyP95IncreaseMs?: number;
}
/** One evaluated row of the drift gate. */
export interface Phase0BaselineDriftGateCriterion {
/** Identifier of the check, e.g. 'baseline_history' or 'sampled_events_drop_pct'. */
criterion: string;
/** Whether this individual check was satisfied. */
pass: boolean;
/** Observed value rendered as text: a number, 'present'/'missing', or 'n/a' when unavailable. */
actual: string;
/** Limit rendered as text, e.g. '>= 50', '<= 30' or 'required'. */
threshold: string;
}
/** Overall outcome of the drift gate together with the per-criterion detail. */
export interface Phase0BaselineDriftGateResult {
/** True when every entry in `criteria` passed (also true when no criteria were evaluated). */
pass: boolean;
/** Evaluated criteria in the order they were checked. */
criteria: Phase0BaselineDriftGateCriterion[];
}
/**
 * Narrows an arbitrary value to a finite number.
 *
 * @param value - Anything; only the primitive `number` type is accepted.
 * @returns The value itself when it is a finite number, otherwise `undefined`
 *          (this includes `NaN`, `Infinity`, numeric strings and nullish values).
 */
function readFiniteNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
/**
 * Same check as `readFiniteNumber`, but reports an unusable value as `null`
 * rather than `undefined`.
 *
 * @param value - Anything; only finite numbers are passed through.
 * @returns The finite number, or `null` when the value is not one.
 */
function readFiniteNumberOrNull(value: unknown): number | null {
  return readFiniteNumber(value) ?? null;
}
/**
 * Reads a gate threshold, accepting only finite numbers.
 *
 * @param value - Raw threshold value taken from the thresholds object.
 * @returns The threshold when it is a finite number, otherwise `undefined`,
 *          which callers treat as "this check is not configured".
 */
function readThreshold(value: unknown): number | undefined {
  return readFiniteNumber(value);
}
/**
 * Relative change of `candidate` against `baseline`, as a percentage.
 *
 * @param baseline - Reference value; must be finite and greater than zero.
 * @param candidate - Value being compared; must be finite.
 * @returns `(candidate - baseline) / baseline * 100` rounded to two decimals,
 *          or `null` when either precondition above does not hold.
 */
function toPctDelta(baseline: number, candidate: number): number | null {
  const comparable = Number.isFinite(baseline) && baseline > 0 && Number.isFinite(candidate);
  if (!comparable) {
    return null;
  }
  const pctChange = ((candidate - baseline) / baseline) * 100;
  return Math.round(pctChange * 100) / 100;
}
/**
 * Plain difference `candidate - baseline`, rounded to two decimals.
 *
 * @param baseline - Reference value, or `null` when unavailable.
 * @param candidate - Value being compared, or `null` when unavailable.
 * @returns The rounded difference, or `null` when either side is `null`.
 */
function toRateDeltaPp(baseline: number | null, candidate: number | null): number | null {
  return baseline !== null && candidate !== null
    ? Math.round((candidate - baseline) * 100) / 100
    : null;
}
/**
 * Renders a metric value for display.
 *
 * @param value - Number to render, or `null` when unavailable.
 * @param suffix - Text appended directly after the number (defaults to none).
 * @returns The number followed by `suffix`, or `'n/a'` when the value is
 *          `null` or not finite.
 */
function formatNumber(value: number | null, suffix = ''): string {
  const printable = value !== null && Number.isFinite(value);
  return printable ? `${value}${suffix}` : 'n/a';
}
/**
 * Flattens an artifact document into the metrics used for drift comparison.
 *
 * Missing sections are tolerated: count fields fall back to 0, while rate and
 * latency fields fall back to `null`. Values that are not finite numbers are
 * treated as missing.
 *
 * @param artifact - Artifact payload; any of its fields may be absent.
 * @returns A fully populated snapshot.
 */
export function extractPhase0BaselineDriftSnapshot(
  artifact: Phase0BaselineArtifactDocument,
): Phase0BaselineDriftSnapshot {
  const summary = artifact.summary;
  const overallOutcomes = summary?.run_outcomes?.overall;
  const reactionStats = summary?.reactions;
  const latencyStats = summary?.cancel_latency_ms;

  // Counts default to zero; everything else keeps "unknown" as null.
  const countOrZero = (raw: unknown): number => readFiniteNumber(raw) ?? 0;

  return {
    source_event_count: countOrZero(artifact.source_event_count),
    sampled_event_count: countOrZero(artifact.sampled_event_count),
    run_total_outcomes: countOrZero(overallOutcomes?.total_outcomes),
    completion_rate_pct: readFiniteNumberOrNull(overallOutcomes?.completion_rate_pct),
    cancel_rate_pct: readFiniteNumberOrNull(overallOutcomes?.cancel_rate_pct),
    error_rate_pct: readFiniteNumberOrNull(overallOutcomes?.error_rate_pct),
    cancel_latency_p95_ms: readFiniteNumberOrNull(latencyStats?.p95_ms),
    reaction_match_rate_pct: readFiniteNumberOrNull(reactionStats?.match_rate_pct),
    reaction_skip_rate_pct: readFiniteNumberOrNull(reactionStats?.skip_rate_pct),
  };
}
/**
 * Compares a candidate artifact with an optional baseline artifact.
 *
 * Note the argument order: the candidate comes first. All deltas are
 * "candidate minus baseline"; counts are expressed as a relative percentage,
 * rates and latency as a plain difference.
 *
 * @param candidateArtifact - Artifact under evaluation.
 * @param baselineArtifact - Reference artifact; when omitted or `null`, the
 *        result has `baseline: null` and every delta is `null`.
 * @returns Both snapshots together with the computed deltas.
 */
export function comparePhase0BaselineDrift(
  candidateArtifact: Phase0BaselineArtifactDocument,
  baselineArtifact?: Phase0BaselineArtifactDocument | null,
): Phase0BaselineDriftComparison {
  const candidate = extractPhase0BaselineDriftSnapshot(candidateArtifact);

  // No baseline: nothing can be compared, so every delta is null.
  if (!baselineArtifact) {
    return {
      baseline: null,
      candidate,
      deltas: {
        sampled_event_count_pct: null,
        run_total_outcomes_pct: null,
        completion_rate_pp: null,
        cancel_rate_pp: null,
        error_rate_pp: null,
        cancel_latency_p95_ms: null,
        reaction_match_rate_pp: null,
        reaction_skip_rate_pp: null,
      },
    };
  }

  const baseline = extractPhase0BaselineDriftSnapshot(baselineArtifact);
  return {
    baseline,
    candidate,
    deltas: {
      sampled_event_count_pct: toPctDelta(baseline.sampled_event_count, candidate.sampled_event_count),
      run_total_outcomes_pct: toPctDelta(baseline.run_total_outcomes, candidate.run_total_outcomes),
      completion_rate_pp: toRateDeltaPp(baseline.completion_rate_pct, candidate.completion_rate_pct),
      cancel_rate_pp: toRateDeltaPp(baseline.cancel_rate_pct, candidate.cancel_rate_pct),
      error_rate_pp: toRateDeltaPp(baseline.error_rate_pct, candidate.error_rate_pct),
      cancel_latency_p95_ms: toRateDeltaPp(baseline.cancel_latency_p95_ms, candidate.cancel_latency_p95_ms),
      reaction_match_rate_pp: toRateDeltaPp(baseline.reaction_match_rate_pct, candidate.reaction_match_rate_pct),
      reaction_skip_rate_pp: toRateDeltaPp(baseline.reaction_skip_rate_pct, candidate.reaction_skip_rate_pct),
    },
  };
}
/**
 * Evaluates a drift comparison against the configured thresholds.
 *
 * Only thresholds that are finite numbers produce a criterion. Criteria are
 * emitted in a fixed order: `baseline_history` (only when
 * `requireBaselineHistory` is truthy), `candidate_sampled_events`,
 * `baseline_sampled_events`, `sampled_events_drop_pct`,
 * `run_outcomes_drop_pct`, `completion_rate_drop_pp`,
 * `cancel_rate_increase_pp`, `error_rate_increase_pp`,
 * `cancel_latency_p95_increase_ms`.
 *
 * A criterion whose value cannot be computed is reported with
 * `actual: 'n/a'`; it passes unless `requireBaselineHistory` is truthy.
 *
 * @param comparison - Output of `comparePhase0BaselineDrift`.
 * @param thresholds - Limits to enforce; unset limits are skipped.
 * @returns The per-criterion rows and an overall `pass` flag that is true
 *          when every row passed.
 */
export function evaluatePhase0BaselineDriftGate(
  comparison: Phase0BaselineDriftComparison,
  thresholds: Phase0BaselineDriftGateThresholds,
): Phase0BaselineDriftGateResult {
  const rows: Phase0BaselineDriftGateCriterion[] = [];
  const strict = Boolean(thresholds.requireBaselineHistory);
  const baseline = comparison.baseline;

  // Shared implementation for the six "delta must not regress by more than X" checks.
  // `regression` says which sign of the delta counts as getting worse; movement in
  // the other direction is clamped to 0 and therefore always within the limit.
  const checkDelta = (
    criterion: string,
    rawLimit: unknown,
    deltaKey: keyof Phase0BaselineDriftDeltas,
    regression: 'drop' | 'increase',
  ): void => {
    const limit = readThreshold(rawLimit);
    if (limit === undefined) {
      return;
    }
    const threshold = `<= ${limit}`;
    const delta = comparison.deltas[deltaKey];
    if (delta === null) {
      rows.push({ criterion, pass: !strict, actual: 'n/a', threshold });
      return;
    }
    const regressed = Math.max(0, regression === 'drop' ? -delta : delta);
    rows.push({
      criterion,
      pass: regressed <= limit,
      actual: `${Math.round(regressed * 100) / 100}`,
      threshold,
    });
  };

  if (strict) {
    rows.push({
      criterion: 'baseline_history',
      pass: baseline !== null,
      actual: baseline ? 'present' : 'missing',
      threshold: 'required',
    });
  }

  const minCandidateSampled = readThreshold(thresholds.minCandidateSampledEvents);
  if (minCandidateSampled !== undefined) {
    const candidateSampled = comparison.candidate.sampled_event_count;
    rows.push({
      criterion: 'candidate_sampled_events',
      pass: candidateSampled >= minCandidateSampled,
      actual: String(candidateSampled),
      threshold: `>= ${minCandidateSampled}`,
    });
  }

  const minBaselineSampled = readThreshold(thresholds.minBaselineSampledEvents);
  if (minBaselineSampled !== undefined) {
    const threshold = `>= ${minBaselineSampled}`;
    rows.push(
      baseline
        ? {
            criterion: 'baseline_sampled_events',
            pass: baseline.sampled_event_count >= minBaselineSampled,
            actual: String(baseline.sampled_event_count),
            threshold,
          }
        : {
            criterion: 'baseline_sampled_events',
            pass: !strict,
            actual: 'n/a',
            threshold,
          },
    );
  }

  checkDelta('sampled_events_drop_pct', thresholds.maxSampledEventsDropPct, 'sampled_event_count_pct', 'drop');
  checkDelta('run_outcomes_drop_pct', thresholds.maxRunOutcomesDropPct, 'run_total_outcomes_pct', 'drop');
  checkDelta('completion_rate_drop_pp', thresholds.maxCompletionRateDropPp, 'completion_rate_pp', 'drop');
  checkDelta('cancel_rate_increase_pp', thresholds.maxCancelRateIncreasePp, 'cancel_rate_pp', 'increase');
  checkDelta('error_rate_increase_pp', thresholds.maxErrorRateIncreasePp, 'error_rate_pp', 'increase');
  checkDelta(
    'cancel_latency_p95_increase_ms',
    thresholds.maxCancelLatencyP95IncreaseMs,
    'cancel_latency_p95_ms',
    'increase',
  );

  return {
    pass: rows.every((row) => row.pass),
    criteria: rows,
  };
}
/**
 * Renders a snapshot as a single line of space-separated `key=value` pairs.
 *
 * Only the sampled count, outcome count, the three run rates and the cancel
 * p95 latency are included. Unavailable rates and latency render as `n/a`;
 * rates carry a `%` suffix.
 *
 * @param snapshot - Snapshot to render.
 * @returns The one-line summary string.
 */
export function renderPhase0BaselineDriftSnapshot(snapshot: Phase0BaselineDriftSnapshot): string {
  const asPercent = (rate: number | null): string => formatNumber(rate, '%');
  const fields: Array<[string, string | number]> = [
    ['sampled', snapshot.sampled_event_count],
    ['outcomes', snapshot.run_total_outcomes],
    ['completion', asPercent(snapshot.completion_rate_pct)],
    ['cancel', asPercent(snapshot.cancel_rate_pct)],
    ['error', asPercent(snapshot.error_rate_pct)],
    ['cancel_p95_ms', formatNumber(snapshot.cancel_latency_p95_ms)],
  ];
  return fields.map(([label, value]) => `${label}=${value}`).join(' ');
}