feat(audit): add guard-coverage thresholds to canary gate

This commit is contained in:
William Valentin
2026-02-23 22:38:14 -08:00
parent 2d42f65b9f
commit c5b310c852
3 changed files with 163 additions and 0 deletions
+9
View File
@@ -36,6 +36,9 @@ function usage(): string {
' --gate-min-target-routes <number>',
' --gate-min-baseline-routes <number>',
' --gate-min-target-attempts <number>',
' --gate-min-guard-pi-no-tools-count <number>',
' --gate-min-guard-capability-query-count <number>',
' --gate-min-guard-attachments-present-count <number>',
' --gate-max-completion-drop-pp <number>',
' --gate-max-p50-latency-increase-ms <number>',
' --gate-max-p95-latency-increase-ms <number>',
@@ -132,6 +135,9 @@ async function main(): Promise<void> {
'gate-min-target-routes': { type: 'string' },
'gate-min-baseline-routes': { type: 'string' },
'gate-min-target-attempts': { type: 'string' },
'gate-min-guard-pi-no-tools-count': { type: 'string' },
'gate-min-guard-capability-query-count': { type: 'string' },
'gate-min-guard-attachments-present-count': { type: 'string' },
'gate-max-completion-drop-pp': { type: 'string' },
'gate-max-p50-latency-increase-ms': { type: 'string' },
'gate-max-p95-latency-increase-ms': { type: 'string' },
@@ -180,6 +186,9 @@ async function main(): Promise<void> {
minTargetRoutes: parseOptionalNumber(values['gate-min-target-routes'], '--gate-min-target-routes'),
minBaselineRoutes: parseOptionalNumber(values['gate-min-baseline-routes'], '--gate-min-baseline-routes'),
minTargetAttempts: parseOptionalNumber(values['gate-min-target-attempts'], '--gate-min-target-attempts'),
minGuardPiNoToolsCount: parseOptionalNumber(values['gate-min-guard-pi-no-tools-count'], '--gate-min-guard-pi-no-tools-count'),
minGuardCapabilityQueryCount: parseOptionalNumber(values['gate-min-guard-capability-query-count'], '--gate-min-guard-capability-query-count'),
minGuardAttachmentsPresentCount: parseOptionalNumber(values['gate-min-guard-attachments-present-count'], '--gate-min-guard-attachments-present-count'),
maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
+107
View File
@@ -206,6 +206,52 @@ describe('summarizeBackendCanary', () => {
{ category: 'pi_module_interface', count: 1, pct: 50 },
]);
});
it('tracks forced-native guard reasons', () => {
  // The same backend pairing drives both the summary and the markdown render.
  const backends = { targetBackend: 'pi_embedded', baselineBackend: 'native' } as const;

  // Two sessions, each routed to the native backend by a different
  // forced-native guard and each followed by an assistant message.
  const auditTrail: AuditEvent[] = [
    makeEvent(1000, 'backend.route', {
      session_id: 's1',
      channel: 'telegram',
      sender: '1',
      selected_backend: 'native',
      source: 'forced_native_guard',
      guard_reason: 'pi_no_tools_mode',
    }),
    makeEvent(1010, 'session.message', {
      session_id: 's1',
      role: 'assistant',
      content_length: 10,
    }),
    makeEvent(2000, 'backend.route', {
      session_id: 's2',
      channel: 'telegram',
      sender: '2',
      selected_backend: 'native',
      source: 'forced_native_guard',
      guard_reason: 'capability_query',
    }),
    makeEvent(2010, 'session.message', {
      session_id: 's2',
      role: 'assistant',
      content_length: 10,
    }),
  ];

  const canarySummary = summarizeBackendCanary(auditTrail, backends);

  // Each guard reason was hit exactly once.
  const guardCounts = canarySummary.route_stats.forced_native_guards;
  expect(guardCounts.pi_no_tools_mode).toBe(1);
  expect(guardCounts.capability_query).toBe(1);

  // The rendered report carries the guard section heading and a reason row.
  const report = renderBackendCanaryMarkdown(canarySummary, backends);
  for (const fragment of ['Forced Native Guards', 'pi_no_tools_mode']) {
    expect(report).toContain(fragment);
  }
});
});
describe('evaluateBackendCanaryGate', () => {
@@ -322,4 +368,65 @@ describe('evaluateBackendCanaryGate', () => {
actual: '1',
}));
});
it('evaluates guard coverage thresholds', () => {
  // Guard hits exist for pi_no_tools_mode and attachments_present only;
  // no capability_query route is present in the audit trail.
  const auditTrail: AuditEvent[] = [
    makeEvent(1000, 'backend.route', {
      session_id: 's1',
      channel: 'telegram',
      sender: '1',
      selected_backend: 'native',
      source: 'forced_native_guard',
      guard_reason: 'pi_no_tools_mode',
    }),
    makeEvent(1010, 'session.message', {
      session_id: 's1',
      role: 'assistant',
      content_length: 20,
    }),
    makeEvent(2000, 'backend.route', {
      session_id: 's2',
      channel: 'telegram',
      sender: '2',
      selected_backend: 'native',
      source: 'forced_native_guard',
      guard_reason: 'attachments_present',
    }),
    makeEvent(2010, 'session.message', {
      session_id: 's2',
      role: 'assistant',
      content_length: 20,
    }),
  ];

  const canarySummary = summarizeBackendCanary(auditTrail, {
    targetBackend: 'pi_embedded',
    baselineBackend: 'native',
  });

  // Require at least one hit for every guard reason.
  const { pass: gatePassed, criteria: gateCriteria } = evaluateBackendCanaryGate(canarySummary, {
    minGuardPiNoToolsCount: 1,
    minGuardCapabilityQueryCount: 1,
    minGuardAttachmentsPresentCount: 1,
  });

  // Partial matcher for one expected criterion row.
  const criterionRow = (criterion: string, passed: boolean, actual: string) =>
    expect.objectContaining({ criterion, pass: passed, actual });

  // The missing capability_query coverage fails the gate as a whole.
  expect(gatePassed).toBe(false);
  expect(gateCriteria).toEqual([
    criterionRow('Minimum pi_no_tools_mode guard hits', true, '1'),
    criterionRow('Minimum capability_query guard hits', false, '0'),
    criterionRow('Minimum attachments_present guard hits', true, '1'),
  ]);
});
});
+47
View File
@@ -81,6 +81,9 @@ export interface BackendCanaryGateThresholds {
minTargetRoutes?: number;
minBaselineRoutes?: number;
minTargetAttempts?: number;
minGuardPiNoToolsCount?: number;
minGuardCapabilityQueryCount?: number;
minGuardAttachmentsPresentCount?: number;
maxCompletionRateDropPp?: number;
maxP50LatencyIncreaseMs?: number;
maxP95LatencyIncreaseMs?: number;
@@ -513,6 +516,36 @@ export function evaluateBackendCanaryGate(
});
}
// Guard-coverage criteria: for every configured minimum, compare the number
// of forced-native routes recorded under that guard reason (0 when the reason
// never appeared) against the minimum. Entry order fixes the order in which
// criteria are appended.
const guardCoverageChecks = [
  {
    reason: 'pi_no_tools_mode',
    label: 'Minimum pi_no_tools_mode guard hits',
    minimum: thresholds.minGuardPiNoToolsCount,
  },
  {
    reason: 'capability_query',
    label: 'Minimum capability_query guard hits',
    minimum: thresholds.minGuardCapabilityQueryCount,
  },
  {
    reason: 'attachments_present',
    label: 'Minimum attachments_present guard hits',
    minimum: thresholds.minGuardAttachmentsPresentCount,
  },
] as const;

for (const { reason, label, minimum } of guardCoverageChecks) {
  // Thresholds that were not supplied contribute no criterion.
  if (typeof minimum !== 'number') {
    continue;
  }
  const observedHits = summary.route_stats.forced_native_guards[reason] ?? 0;
  criteria.push({
    criterion: label,
    pass: observedHits >= minimum,
    actual: `${observedHits}`,
    threshold: `>= ${minimum}`,
  });
}
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
const delta = summary.comparison.completion_rate_delta_pp;
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);
@@ -600,6 +633,20 @@ export function renderBackendCanaryMarkdown(
}
lines.push('');
// "Forced Native Guards" section: one table row per guard reason, highest
// count first, ties broken by reason name; a placeholder row is emitted when
// no guard was recorded.
const guardRows = Object.entries(summary.route_stats.forced_native_guards);
guardRows.sort(
  ([reasonA, countA], [reasonB, countB]) => countB - countA || reasonA.localeCompare(reasonB),
);
const forcedGuardTableLines = guardRows.length > 0
  ? guardRows.map(([reason, count]) => `| ${reason} | ${count} |`)
  : ['| _none_ | 0 |'];
lines.push(
  '### Forced Native Guards',
  '',
  '| Guard reason | Count |',
  '| --- | ---: |',
  ...forcedGuardTableLines,
  '',
);
lines.push('## Reliability');
lines.push('');
lines.push('| Metric | Target | Baseline | Delta |');