feat(audit): add guard-coverage thresholds to canary gate
This commit is contained in:
@@ -206,6 +206,52 @@ describe('summarizeBackendCanary', () => {
|
||||
{ category: 'pi_module_interface', count: 1, pct: 50 },
|
||||
]);
|
||||
});
|
||||
|
||||
// Checks that forced-native guard routes are counted per guard_reason in
// summary.route_stats.forced_native_guards and that the rendered markdown
// report includes the guard section.
it('tracks forced-native guard reasons', () => {
|
||||
// Two sessions, each routed to 'native' with source 'forced_native_guard' but
// with a different guard_reason, each followed by one assistant message.
// NOTE(review): the first makeEvent argument looks like a timestamp — confirm against the helper.
const events: AuditEvent[] = [
|
||||
makeEvent(1000, 'backend.route', {
|
||||
session_id: 's1',
|
||||
channel: 'telegram',
|
||||
sender: '1',
|
||||
selected_backend: 'native',
|
||||
source: 'forced_native_guard',
|
||||
guard_reason: 'pi_no_tools_mode',
|
||||
}),
|
||||
makeEvent(1010, 'session.message', {
|
||||
session_id: 's1',
|
||||
role: 'assistant',
|
||||
content_length: 10,
|
||||
}),
|
||||
makeEvent(2000, 'backend.route', {
|
||||
session_id: 's2',
|
||||
channel: 'telegram',
|
||||
sender: '2',
|
||||
selected_backend: 'native',
|
||||
source: 'forced_native_guard',
|
||||
guard_reason: 'capability_query',
|
||||
}),
|
||||
makeEvent(2010, 'session.message', {
|
||||
session_id: 's2',
|
||||
role: 'assistant',
|
||||
content_length: 10,
|
||||
}),
|
||||
];
|
||||
|
||||
const summary = summarizeBackendCanary(events, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
});
|
||||
|
||||
// Each guard reason appeared on exactly one route event above.
expect(summary.route_stats.forced_native_guards.pi_no_tools_mode).toBe(1);
|
||||
expect(summary.route_stats.forced_native_guards.capability_query).toBe(1);
|
||||
|
||||
const markdown = renderBackendCanaryMarkdown(summary, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
});
|
||||
// The report must contain the guard section heading and at least one guard reason row.
expect(markdown).toContain('Forced Native Guards');
|
||||
expect(markdown).toContain('pi_no_tools_mode');
|
||||
});
|
||||
});
|
||||
|
||||
describe('evaluateBackendCanaryGate', () => {
|
||||
@@ -322,4 +368,65 @@ describe('evaluateBackendCanaryGate', () => {
|
||||
actual: '1',
|
||||
}));
|
||||
});
|
||||
|
||||
// Checks that the gate emits one criterion per configured guard-coverage
// threshold and reports a guard reason with no hits as a failing criterion.
it('evaluates guard coverage thresholds', () => {
|
||||
// Guard hits are present for 'pi_no_tools_mode' and 'attachments_present';
// no event carries guard_reason 'capability_query'.
const events: AuditEvent[] = [
|
||||
makeEvent(1000, 'backend.route', {
|
||||
session_id: 's1',
|
||||
channel: 'telegram',
|
||||
sender: '1',
|
||||
selected_backend: 'native',
|
||||
source: 'forced_native_guard',
|
||||
guard_reason: 'pi_no_tools_mode',
|
||||
}),
|
||||
makeEvent(1010, 'session.message', {
|
||||
session_id: 's1',
|
||||
role: 'assistant',
|
||||
content_length: 20,
|
||||
}),
|
||||
makeEvent(2000, 'backend.route', {
|
||||
session_id: 's2',
|
||||
channel: 'telegram',
|
||||
sender: '2',
|
||||
selected_backend: 'native',
|
||||
source: 'forced_native_guard',
|
||||
guard_reason: 'attachments_present',
|
||||
}),
|
||||
makeEvent(2010, 'session.message', {
|
||||
session_id: 's2',
|
||||
role: 'assistant',
|
||||
content_length: 20,
|
||||
}),
|
||||
];
|
||||
|
||||
const summary = summarizeBackendCanary(events, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
});
|
||||
|
||||
// Require at least one hit for each of the three guard reasons.
const gate = evaluateBackendCanaryGate(summary, {
|
||||
minGuardPiNoToolsCount: 1,
|
||||
minGuardCapabilityQueryCount: 1,
|
||||
minGuardAttachmentsPresentCount: 1,
|
||||
});
|
||||
|
||||
// The gate is expected to fail overall; presumably because the capability_query
// criterion below does not pass — confirm against how gate.pass is derived.
expect(gate.pass).toBe(false);
|
||||
// toEqual on an array requires exactly these three criteria, in this order;
// objectContaining lets each entry carry extra fields (e.g. threshold).
expect(gate.criteria).toEqual([
|
||||
expect.objectContaining({
|
||||
criterion: 'Minimum pi_no_tools_mode guard hits',
|
||||
pass: true,
|
||||
actual: '1',
|
||||
}),
|
||||
expect.objectContaining({
|
||||
criterion: 'Minimum capability_query guard hits',
|
||||
pass: false,
|
||||
actual: '0',
|
||||
}),
|
||||
expect.objectContaining({
|
||||
criterion: 'Minimum attachments_present guard hits',
|
||||
pass: true,
|
||||
actual: '1',
|
||||
}),
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -81,6 +81,9 @@ export interface BackendCanaryGateThresholds {
|
||||
minTargetRoutes?: number;
|
||||
minBaselineRoutes?: number;
|
||||
minTargetAttempts?: number;
|
||||
minGuardPiNoToolsCount?: number;
|
||||
minGuardCapabilityQueryCount?: number;
|
||||
minGuardAttachmentsPresentCount?: number;
|
||||
maxCompletionRateDropPp?: number;
|
||||
maxP50LatencyIncreaseMs?: number;
|
||||
maxP95LatencyIncreaseMs?: number;
|
||||
@@ -513,6 +516,36 @@ export function evaluateBackendCanaryGate(
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.minGuardPiNoToolsCount === 'number') {
|
||||
const count = summary.route_stats.forced_native_guards.pi_no_tools_mode ?? 0;
|
||||
criteria.push({
|
||||
criterion: 'Minimum pi_no_tools_mode guard hits',
|
||||
pass: count >= thresholds.minGuardPiNoToolsCount,
|
||||
actual: `${count}`,
|
||||
threshold: `>= ${thresholds.minGuardPiNoToolsCount}`,
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.minGuardCapabilityQueryCount === 'number') {
|
||||
const count = summary.route_stats.forced_native_guards.capability_query ?? 0;
|
||||
criteria.push({
|
||||
criterion: 'Minimum capability_query guard hits',
|
||||
pass: count >= thresholds.minGuardCapabilityQueryCount,
|
||||
actual: `${count}`,
|
||||
threshold: `>= ${thresholds.minGuardCapabilityQueryCount}`,
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.minGuardAttachmentsPresentCount === 'number') {
|
||||
const count = summary.route_stats.forced_native_guards.attachments_present ?? 0;
|
||||
criteria.push({
|
||||
criterion: 'Minimum attachments_present guard hits',
|
||||
pass: count >= thresholds.minGuardAttachmentsPresentCount,
|
||||
actual: `${count}`,
|
||||
threshold: `>= ${thresholds.minGuardAttachmentsPresentCount}`,
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
|
||||
const delta = summary.comparison.completion_rate_delta_pp;
|
||||
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);
|
||||
@@ -600,6 +633,20 @@ export function renderBackendCanaryMarkdown(
|
||||
}
|
||||
lines.push('');
|
||||
|
||||
lines.push('### Forced Native Guards');
|
||||
lines.push('');
|
||||
lines.push('| Guard reason | Count |');
|
||||
lines.push('| --- | ---: |');
|
||||
const guardRows = Object.entries(summary.route_stats.forced_native_guards)
|
||||
.sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
|
||||
for (const [reason, count] of guardRows) {
|
||||
lines.push(`| ${reason} | ${count} |`);
|
||||
}
|
||||
if (guardRows.length === 0) {
|
||||
lines.push('| _none_ | 0 |');
|
||||
}
|
||||
lines.push('');
|
||||
|
||||
lines.push('## Reliability');
|
||||
lines.push('');
|
||||
lines.push('| Metric | Target | Baseline | Delta |');
|
||||
|
||||
Reference in New Issue
Block a user