From c5b310c852d055e279f1d91542c8a0f57c986844 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Mon, 23 Feb 2026 22:38:14 -0800 Subject: [PATCH] feat(audit): add guard-coverage thresholds to canary gate --- scripts/summarize-backend-canary.ts | 9 +++ src/audit/backendCanarySummary.test.ts | 107 +++++++++++++++++++++++++ src/audit/backendCanarySummary.ts | 47 +++++++++++ 3 files changed, 163 insertions(+) diff --git a/scripts/summarize-backend-canary.ts b/scripts/summarize-backend-canary.ts index 1f6f202..3752700 100755 --- a/scripts/summarize-backend-canary.ts +++ b/scripts/summarize-backend-canary.ts @@ -36,6 +36,9 @@ function usage(): string { ' --gate-min-target-routes ', ' --gate-min-baseline-routes ', ' --gate-min-target-attempts ', + ' --gate-min-guard-pi-no-tools-count ', + ' --gate-min-guard-capability-query-count ', + ' --gate-min-guard-attachments-present-count ', ' --gate-max-completion-drop-pp ', ' --gate-max-p50-latency-increase-ms ', ' --gate-max-p95-latency-increase-ms ', @@ -132,6 +135,9 @@ async function main(): Promise { 'gate-min-target-routes': { type: 'string' }, 'gate-min-baseline-routes': { type: 'string' }, 'gate-min-target-attempts': { type: 'string' }, + 'gate-min-guard-pi-no-tools-count': { type: 'string' }, + 'gate-min-guard-capability-query-count': { type: 'string' }, + 'gate-min-guard-attachments-present-count': { type: 'string' }, 'gate-max-completion-drop-pp': { type: 'string' }, 'gate-max-p50-latency-increase-ms': { type: 'string' }, 'gate-max-p95-latency-increase-ms': { type: 'string' }, @@ -180,6 +186,9 @@ async function main(): Promise { minTargetRoutes: parseOptionalNumber(values['gate-min-target-routes'], '--gate-min-target-routes'), minBaselineRoutes: parseOptionalNumber(values['gate-min-baseline-routes'], '--gate-min-baseline-routes'), minTargetAttempts: parseOptionalNumber(values['gate-min-target-attempts'], '--gate-min-target-attempts'), + minGuardPiNoToolsCount: parseOptionalNumber(values['gate-min-guard-pi-no-tools-count'], '--gate-min-guard-pi-no-tools-count'), + minGuardCapabilityQueryCount: parseOptionalNumber(values['gate-min-guard-capability-query-count'], '--gate-min-guard-capability-query-count'), + minGuardAttachmentsPresentCount: parseOptionalNumber(values['gate-min-guard-attachments-present-count'], '--gate-min-guard-attachments-present-count'), maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'), maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'), maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'), diff --git a/src/audit/backendCanarySummary.test.ts b/src/audit/backendCanarySummary.test.ts index 9141aa9..3a99b8b 100644 --- a/src/audit/backendCanarySummary.test.ts +++ b/src/audit/backendCanarySummary.test.ts @@ -206,6 +206,52 @@ describe('summarizeBackendCanary', () => { { category: 'pi_module_interface', count: 1, pct: 50 }, ]); }); + + it('tracks forced-native guard reasons', () => { + const events: AuditEvent[] = [ + makeEvent(1000, 'backend.route', { + session_id: 's1', + channel: 'telegram', + sender: '1', + selected_backend: 'native', + source: 'forced_native_guard', + guard_reason: 'pi_no_tools_mode', + }), + makeEvent(1010, 'session.message', { + session_id: 's1', + role: 'assistant', + content_length: 10, + }), + makeEvent(2000, 'backend.route', { + session_id: 's2', + channel: 'telegram', + sender: '2', + selected_backend: 'native', + source: 'forced_native_guard', + guard_reason: 'capability_query', + }), + makeEvent(2010, 'session.message', { + session_id: 's2', + role: 'assistant', + content_length: 10, + }), + ]; + + const summary = summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + + expect(summary.route_stats.forced_native_guards.pi_no_tools_mode).toBe(1); + expect(summary.route_stats.forced_native_guards.capability_query).toBe(1); + + const markdown = renderBackendCanaryMarkdown(summary, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + expect(markdown).toContain('Forced Native Guards'); + expect(markdown).toContain('pi_no_tools_mode'); + }); }); describe('evaluateBackendCanaryGate', () => { @@ -322,4 +368,65 @@ describe('evaluateBackendCanaryGate', () => { actual: '1', })); }); + + it('evaluates guard coverage thresholds', () => { + const events: AuditEvent[] = [ + makeEvent(1000, 'backend.route', { + session_id: 's1', + channel: 'telegram', + sender: '1', + selected_backend: 'native', + source: 'forced_native_guard', + guard_reason: 'pi_no_tools_mode', + }), + makeEvent(1010, 'session.message', { + session_id: 's1', + role: 'assistant', + content_length: 20, + }), + makeEvent(2000, 'backend.route', { + session_id: 's2', + channel: 'telegram', + sender: '2', + selected_backend: 'native', + source: 'forced_native_guard', + guard_reason: 'attachments_present', + }), + makeEvent(2010, 'session.message', { + session_id: 's2', + role: 'assistant', + content_length: 20, + }), + ]; + + const summary = summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + + const gate = evaluateBackendCanaryGate(summary, { + minGuardPiNoToolsCount: 1, + minGuardCapabilityQueryCount: 1, + minGuardAttachmentsPresentCount: 1, + }); + + expect(gate.pass).toBe(false); + expect(gate.criteria).toEqual([ + expect.objectContaining({ + criterion: 'Minimum pi_no_tools_mode guard hits', + pass: true, + actual: '1', + }), + expect.objectContaining({ + criterion: 'Minimum capability_query guard hits', + pass: false, + actual: '0', + }), + expect.objectContaining({ + criterion: 'Minimum attachments_present guard hits', + pass: true, + actual: '1', + }), + ]); + }); }); diff --git a/src/audit/backendCanarySummary.ts b/src/audit/backendCanarySummary.ts index 665438e..d45cedc 100644 --- a/src/audit/backendCanarySummary.ts +++ b/src/audit/backendCanarySummary.ts @@ -81,6 +81,9 @@ export interface BackendCanaryGateThresholds { minTargetRoutes?: number; minBaselineRoutes?: number; minTargetAttempts?: number; + minGuardPiNoToolsCount?: number; + minGuardCapabilityQueryCount?: number; + minGuardAttachmentsPresentCount?: number; maxCompletionRateDropPp?: number; maxP50LatencyIncreaseMs?: number; maxP95LatencyIncreaseMs?: number; @@ -513,6 +516,36 @@ export function evaluateBackendCanaryGate( }); } + if (typeof thresholds.minGuardPiNoToolsCount === 'number') { + const count = summary.route_stats.forced_native_guards.pi_no_tools_mode ?? 0; + criteria.push({ + criterion: 'Minimum pi_no_tools_mode guard hits', + pass: count >= thresholds.minGuardPiNoToolsCount, + actual: `${count}`, + threshold: `>= ${thresholds.minGuardPiNoToolsCount}`, + }); + } + + if (typeof thresholds.minGuardCapabilityQueryCount === 'number') { + const count = summary.route_stats.forced_native_guards.capability_query ?? 0; + criteria.push({ + criterion: 'Minimum capability_query guard hits', + pass: count >= thresholds.minGuardCapabilityQueryCount, + actual: `${count}`, + threshold: `>= ${thresholds.minGuardCapabilityQueryCount}`, + }); + } + + if (typeof thresholds.minGuardAttachmentsPresentCount === 'number') { + const count = summary.route_stats.forced_native_guards.attachments_present ?? 0; + criteria.push({ + criterion: 'Minimum attachments_present guard hits', + pass: count >= thresholds.minGuardAttachmentsPresentCount, + actual: `${count}`, + threshold: `>= ${thresholds.minGuardAttachmentsPresentCount}`, + }); + } + if (typeof thresholds.maxCompletionRateDropPp === 'number') { const delta = summary.comparison.completion_rate_delta_pp; const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp); @@ -600,6 +633,20 @@ export function renderBackendCanaryMarkdown( } lines.push(''); + lines.push('### Forced Native Guards'); + lines.push(''); + lines.push('| Guard reason | Count |'); + lines.push('| --- | ---: |'); + const guardRows = Object.entries(summary.route_stats.forced_native_guards) + .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])); + for (const [reason, count] of guardRows) { + lines.push(`| ${reason} | ${count} |`); + } + if (guardRows.length === 0) { + lines.push('| _none_ | 0 |'); + } + lines.push(''); + lines.push('## Reliability'); lines.push(''); lines.push('| Metric | Target | Baseline | Delta |');