feat(audit): add sample-size gate thresholds for canary evaluation

This commit is contained in:
William Valentin
2026-02-23 22:34:50 -08:00
parent 2d31f85c75
commit 34d1562ce8
3 changed files with 164 additions and 0 deletions
+112
View File
@@ -149,6 +149,63 @@ describe('summarizeBackendCanary', () => {
expect(summary.target.routes).toBe(1);
expect(summary.baseline.routes).toBe(0);
});
// Two pi_embedded -> native fallbacks with different reason strings must be
// reported under the category names asserted at the bottom of this test.
it('classifies pi-specific fallback reasons into stable categories', () => {
  const sessionId = 'telegram:canary';

  // Fields shared by every backend.* event in this scenario.
  const origin = {
    session_id: sessionId,
    channel: 'telegram',
    sender: '8367012007',
  };

  // Route decision that selects the Pi backend via an agent override.
  const routeToPi = (ts: number) =>
    makeEvent(ts, 'backend.route', {
      ...origin,
      selected_backend: 'pi_embedded',
      source: 'agent_override',
    });

  // Fallback from the Pi backend to native, carrying the given reason text.
  const fallbackToNative = (ts: number, reason: string, durationMs: number) =>
    makeEvent(ts, 'backend.fallback', {
      ...origin,
      from_backend: 'pi_embedded',
      to_backend: 'native',
      reason,
      duration_ms: durationMs,
    });

  // Assistant message recorded on the same session after each fallback.
  const assistantMessage = (ts: number) =>
    makeEvent(ts, 'session.message', {
      session_id: sessionId,
      role: 'assistant',
      content_length: 20,
    });

  const events: AuditEvent[] = [
    routeToPi(1000),
    fallbackToNative(
      1100,
      'Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession)',
      100,
    ),
    assistantMessage(1200),
    routeToPi(1300),
    fallbackToNative(1400, 'Pi Agent runtime produced no assistant text', 120),
    assistantMessage(1500),
  ];

  const options = {
    targetBackend: 'pi_embedded',
    baselineBackend: 'native',
  };
  const summary = summarizeBackendCanary(events, options);

  // One fallback per category, so each accounts for half of the total.
  const expectedCategories = [
    { category: 'empty_assistant_text', count: 1, pct: 50 },
    { category: 'pi_module_interface', count: 1, pct: 50 },
  ];
  expect(summary.fallback_categories).toEqual(expectedCategories);
});
});
describe('evaluateBackendCanaryGate', () => {
@@ -210,4 +267,59 @@ describe('evaluateBackendCanaryGate', () => {
expect(markdown).toContain('Pi Embedded Canary Summary');
expect(markdown).toContain('Gate result: PASS');
});
// A single successful pi_embedded route (and no baseline traffic) is evaluated
// against minimum-sample thresholds that it cannot satisfy.
it('fails gate when minimum sample thresholds are not met', () => {
  // Fields shared by the route and success events of the only session.
  const origin = {
    session_id: 's1',
    channel: 'telegram',
    sender: '1',
  };

  const routeEvent = makeEvent(1000, 'backend.route', {
    ...origin,
    selected_backend: 'pi_embedded',
    source: 'agent_override',
  });
  const successEvent = makeEvent(1100, 'backend.success', {
    ...origin,
    backend: 'pi_embedded',
    duration_ms: 100,
    response_length: 8,
  });
  const assistantMessage = makeEvent(1200, 'session.message', {
    session_id: 's1',
    role: 'assistant',
    content_length: 20,
  });
  const events: AuditEvent[] = [routeEvent, successEvent, assistantMessage];

  const summary = summarizeBackendCanary(events, {
    targetBackend: 'pi_embedded',
    baselineBackend: 'native',
  });

  // Four thresholds are supplied, and four criteria are expected back.
  const thresholds = {
    minTargetRoutes: 3,
    minBaselineRoutes: 2,
    minTargetAttempts: 3,
    maxFallbackRatePct: 5,
  };
  const gate = evaluateBackendCanaryGate(summary, thresholds);

  expect(gate.pass).toBe(false);
  expect(gate.criteria).toHaveLength(4);

  // The first three criteria, in order, are the sample-size checks; each must
  // fail and report the observed count as a string.
  const expectedFailures = [
    { criterion: 'Minimum target routes', actual: '1' },
    { criterion: 'Minimum baseline routes', actual: '0' },
    { criterion: 'Minimum target external attempts', actual: '1' },
  ];
  expectedFailures.forEach(({ criterion, actual }, index) => {
    expect(gate.criteria[index]).toEqual(
      expect.objectContaining({ criterion, pass: false, actual }),
    );
  });
});
});
+43
View File
@@ -78,6 +78,9 @@ export interface BackendCanarySummary {
}
export interface BackendCanaryGateThresholds {
minTargetRoutes?: number;
minBaselineRoutes?: number;
minTargetAttempts?: number;
maxCompletionRateDropPp?: number;
maxP50LatencyIncreaseMs?: number;
maxP95LatencyIncreaseMs?: number;
@@ -182,6 +185,18 @@ function normalizeFallbackCategory(reason: string): string {
if (!normalized) {
return 'unknown';
}
if (
normalized.includes('does not expose a supported session factory')
|| normalized.includes('expected one of: createagentsession')
) {
return 'pi_module_interface';
}
if (
normalized.includes('produced no assistant text')
|| normalized.includes('no assistant text')
) {
return 'empty_assistant_text';
}
if (normalized.includes('timeout') || normalized.includes('timed out')) {
return 'timeout';
}
@@ -470,6 +485,34 @@ export function evaluateBackendCanaryGate(
): BackendCanaryGateResult {
const criteria: BackendCanaryGateResult['criteria'] = [];
if (typeof thresholds.minTargetRoutes === 'number') {
criteria.push({
criterion: 'Minimum target routes',
pass: summary.target.routes >= thresholds.minTargetRoutes,
actual: `${summary.target.routes}`,
threshold: `>= ${thresholds.minTargetRoutes}`,
});
}
if (typeof thresholds.minBaselineRoutes === 'number') {
criteria.push({
criterion: 'Minimum baseline routes',
pass: summary.baseline.routes >= thresholds.minBaselineRoutes,
actual: `${summary.baseline.routes}`,
threshold: `>= ${thresholds.minBaselineRoutes}`,
});
}
if (typeof thresholds.minTargetAttempts === 'number') {
const attempts = summary.target_external_attempts?.attempts ?? 0;
criteria.push({
criterion: 'Minimum target external attempts',
pass: attempts >= thresholds.minTargetAttempts,
actual: `${attempts}`,
threshold: `>= ${thresholds.minTargetAttempts}`,
});
}
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
const delta = summary.comparison.completion_rate_delta_pp;
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);