From 34d1562ce89b820343e40df76674683c60b5f740 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Mon, 23 Feb 2026 22:34:50 -0800 Subject: [PATCH] feat(audit): add sample-size gate thresholds for canary evaluation --- scripts/summarize-backend-canary.ts | 9 ++ src/audit/backendCanarySummary.test.ts | 112 +++++++++++++++++++++++++ src/audit/backendCanarySummary.ts | 43 ++++++++++ 3 files changed, 164 insertions(+) diff --git a/scripts/summarize-backend-canary.ts b/scripts/summarize-backend-canary.ts index c7cb8d0..1f6f202 100755 --- a/scripts/summarize-backend-canary.ts +++ b/scripts/summarize-backend-canary.ts @@ -33,6 +33,9 @@ function usage(): string { ' --out Write output to file instead of stdout', '', 'Gate options (optional):', + ' --gate-min-target-routes ', + ' --gate-min-baseline-routes ', + ' --gate-min-target-attempts ', ' --gate-max-completion-drop-pp ', ' --gate-max-p50-latency-increase-ms ', ' --gate-max-p95-latency-increase-ms ', @@ -126,6 +129,9 @@ async function main(): Promise { source: { type: 'string' }, format: { type: 'string' }, out: { type: 'string' }, + 'gate-min-target-routes': { type: 'string' }, + 'gate-min-baseline-routes': { type: 'string' }, + 'gate-min-target-attempts': { type: 'string' }, 'gate-max-completion-drop-pp': { type: 'string' }, 'gate-max-p50-latency-increase-ms': { type: 'string' }, 'gate-max-p95-latency-increase-ms': { type: 'string' }, @@ -171,6 +177,9 @@ async function main(): Promise { const summary = summarizeBackendCanary(events, summaryOptions); const gateThresholds: BackendCanaryGateThresholds = { + minTargetRoutes: parseOptionalNumber(values['gate-min-target-routes'], '--gate-min-target-routes'), + minBaselineRoutes: parseOptionalNumber(values['gate-min-baseline-routes'], '--gate-min-baseline-routes'), + minTargetAttempts: parseOptionalNumber(values['gate-min-target-attempts'], '--gate-min-target-attempts'), maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'), maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'), maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'), diff --git a/src/audit/backendCanarySummary.test.ts b/src/audit/backendCanarySummary.test.ts index 1b5e834..9141aa9 100644 --- a/src/audit/backendCanarySummary.test.ts +++ b/src/audit/backendCanarySummary.test.ts @@ -149,6 +149,63 @@ describe('summarizeBackendCanary', () => { expect(summary.target.routes).toBe(1); expect(summary.baseline.routes).toBe(0); }); + + it('classifies pi-specific fallback reasons into stable categories', () => { + const events: AuditEvent[] = [ + makeEvent(1000, 'backend.route', { + session_id: 'telegram:canary', + channel: 'telegram', + sender: '8367012007', + selected_backend: 'pi_embedded', + source: 'agent_override', + }), + makeEvent(1100, 'backend.fallback', { + session_id: 'telegram:canary', + channel: 'telegram', + sender: '8367012007', + from_backend: 'pi_embedded', + to_backend: 'native', + reason: 'Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession)', + duration_ms: 100, + }), + makeEvent(1200, 'session.message', { + session_id: 'telegram:canary', + role: 'assistant', + content_length: 20, + }), + makeEvent(1300, 'backend.route', { + session_id: 'telegram:canary', + channel: 'telegram', + sender: '8367012007', + selected_backend: 'pi_embedded', + source: 'agent_override', + }), + makeEvent(1400, 'backend.fallback', { + session_id: 'telegram:canary', + channel: 'telegram', + sender: '8367012007', + from_backend: 'pi_embedded', + to_backend: 'native', + reason: 'Pi Agent runtime produced no assistant text', + duration_ms: 120, + }), + makeEvent(1500, 'session.message', { + session_id: 'telegram:canary', + role: 'assistant', + content_length: 20, + }), + ]; + + const summary = summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + + expect(summary.fallback_categories).toEqual([ + { category: 'empty_assistant_text', count: 1, pct: 50 }, + { category: 'pi_module_interface', count: 1, pct: 50 }, + ]); + }); }); describe('evaluateBackendCanaryGate', () => { @@ -210,4 +267,59 @@ describe('evaluateBackendCanaryGate', () => { expect(markdown).toContain('Pi Embedded Canary Summary'); expect(markdown).toContain('Gate result: PASS'); }); + + it('fails gate when minimum sample thresholds are not met', () => { + const events: AuditEvent[] = [ + makeEvent(1000, 'backend.route', { + session_id: 's1', + channel: 'telegram', + sender: '1', + selected_backend: 'pi_embedded', + source: 'agent_override', + }), + makeEvent(1100, 'backend.success', { + session_id: 's1', + channel: 'telegram', + sender: '1', + backend: 'pi_embedded', + duration_ms: 100, + response_length: 8, + }), + makeEvent(1200, 'session.message', { + session_id: 's1', + role: 'assistant', + content_length: 20, + }), + ]; + + const summary = summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + + const gate = evaluateBackendCanaryGate(summary, { + minTargetRoutes: 3, + minBaselineRoutes: 2, + minTargetAttempts: 3, + maxFallbackRatePct: 5, + }); + + expect(gate.pass).toBe(false); + expect(gate.criteria).toHaveLength(4); + expect(gate.criteria[0]).toEqual(expect.objectContaining({ + criterion: 'Minimum target routes', + pass: false, + actual: '1', + })); + expect(gate.criteria[1]).toEqual(expect.objectContaining({ + criterion: 'Minimum baseline routes', + pass: false, + actual: '0', + })); + expect(gate.criteria[2]).toEqual(expect.objectContaining({ + criterion: 'Minimum target external attempts', + pass: false, + actual: '1', + })); + }); }); diff --git a/src/audit/backendCanarySummary.ts b/src/audit/backendCanarySummary.ts index 8db7506..665438e 100644 --- a/src/audit/backendCanarySummary.ts +++ b/src/audit/backendCanarySummary.ts @@ -78,6 +78,9 @@ export interface BackendCanarySummary { } export interface BackendCanaryGateThresholds { + minTargetRoutes?: number; + minBaselineRoutes?: number; + minTargetAttempts?: number; maxCompletionRateDropPp?: number; maxP50LatencyIncreaseMs?: number; maxP95LatencyIncreaseMs?: number; @@ -182,6 +185,18 @@ function normalizeFallbackCategory(reason: string): string { if (!normalized) { return 'unknown'; } + if ( + normalized.includes('does not expose a supported session factory') + || normalized.includes('expected one of: createagentsession') + ) { + return 'pi_module_interface'; + } + if ( + normalized.includes('produced no assistant text') + || normalized.includes('no assistant text') + ) { + return 'empty_assistant_text'; + } if (normalized.includes('timeout') || normalized.includes('timed out')) { return 'timeout'; } @@ -470,6 +485,34 @@ export function evaluateBackendCanaryGate( ): BackendCanaryGateResult { const criteria: BackendCanaryGateResult['criteria'] = []; + if (typeof thresholds.minTargetRoutes === 'number') { + criteria.push({ + criterion: 'Minimum target routes', + pass: summary.target.routes >= thresholds.minTargetRoutes, + actual: `${summary.target.routes}`, + threshold: `>= ${thresholds.minTargetRoutes}`, + }); + } + + if (typeof thresholds.minBaselineRoutes === 'number') { + criteria.push({ + criterion: 'Minimum baseline routes', + pass: summary.baseline.routes >= thresholds.minBaselineRoutes, + actual: `${summary.baseline.routes}`, + threshold: `>= ${thresholds.minBaselineRoutes}`, + }); + } + + if (typeof thresholds.minTargetAttempts === 'number') { + const attempts = summary.target_external_attempts?.attempts ?? 0; + criteria.push({ + criterion: 'Minimum target external attempts', + pass: attempts >= thresholds.minTargetAttempts, + actual: `${attempts}`, + threshold: `>= ${thresholds.minTargetAttempts}`, + }); + } + if (typeof thresholds.maxCompletionRateDropPp === 'number') { const delta = summary.comparison.completion_rate_delta_pp; const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);