feat(audit): add guard-coverage thresholds to canary gate
This commit is contained in:
@@ -36,6 +36,9 @@ function usage(): string {
|
|||||||
' --gate-min-target-routes <number>',
|
' --gate-min-target-routes <number>',
|
||||||
' --gate-min-baseline-routes <number>',
|
' --gate-min-baseline-routes <number>',
|
||||||
' --gate-min-target-attempts <number>',
|
' --gate-min-target-attempts <number>',
|
||||||
|
' --gate-min-guard-pi-no-tools-count <number>',
|
||||||
|
' --gate-min-guard-capability-query-count <number>',
|
||||||
|
' --gate-min-guard-attachments-present-count <number>',
|
||||||
' --gate-max-completion-drop-pp <number>',
|
' --gate-max-completion-drop-pp <number>',
|
||||||
' --gate-max-p50-latency-increase-ms <number>',
|
' --gate-max-p50-latency-increase-ms <number>',
|
||||||
' --gate-max-p95-latency-increase-ms <number>',
|
' --gate-max-p95-latency-increase-ms <number>',
|
||||||
@@ -132,6 +135,9 @@ async function main(): Promise<void> {
|
|||||||
'gate-min-target-routes': { type: 'string' },
|
'gate-min-target-routes': { type: 'string' },
|
||||||
'gate-min-baseline-routes': { type: 'string' },
|
'gate-min-baseline-routes': { type: 'string' },
|
||||||
'gate-min-target-attempts': { type: 'string' },
|
'gate-min-target-attempts': { type: 'string' },
|
||||||
|
'gate-min-guard-pi-no-tools-count': { type: 'string' },
|
||||||
|
'gate-min-guard-capability-query-count': { type: 'string' },
|
||||||
|
'gate-min-guard-attachments-present-count': { type: 'string' },
|
||||||
'gate-max-completion-drop-pp': { type: 'string' },
|
'gate-max-completion-drop-pp': { type: 'string' },
|
||||||
'gate-max-p50-latency-increase-ms': { type: 'string' },
|
'gate-max-p50-latency-increase-ms': { type: 'string' },
|
||||||
'gate-max-p95-latency-increase-ms': { type: 'string' },
|
'gate-max-p95-latency-increase-ms': { type: 'string' },
|
||||||
@@ -180,6 +186,9 @@ async function main(): Promise<void> {
|
|||||||
minTargetRoutes: parseOptionalNumber(values['gate-min-target-routes'], '--gate-min-target-routes'),
|
minTargetRoutes: parseOptionalNumber(values['gate-min-target-routes'], '--gate-min-target-routes'),
|
||||||
minBaselineRoutes: parseOptionalNumber(values['gate-min-baseline-routes'], '--gate-min-baseline-routes'),
|
minBaselineRoutes: parseOptionalNumber(values['gate-min-baseline-routes'], '--gate-min-baseline-routes'),
|
||||||
minTargetAttempts: parseOptionalNumber(values['gate-min-target-attempts'], '--gate-min-target-attempts'),
|
minTargetAttempts: parseOptionalNumber(values['gate-min-target-attempts'], '--gate-min-target-attempts'),
|
||||||
|
minGuardPiNoToolsCount: parseOptionalNumber(values['gate-min-guard-pi-no-tools-count'], '--gate-min-guard-pi-no-tools-count'),
|
||||||
|
minGuardCapabilityQueryCount: parseOptionalNumber(values['gate-min-guard-capability-query-count'], '--gate-min-guard-capability-query-count'),
|
||||||
|
minGuardAttachmentsPresentCount: parseOptionalNumber(values['gate-min-guard-attachments-present-count'], '--gate-min-guard-attachments-present-count'),
|
||||||
maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
|
maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
|
||||||
maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
|
maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
|
||||||
maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
|
maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
|
||||||
|
|||||||
@@ -206,6 +206,52 @@ describe('summarizeBackendCanary', () => {
|
|||||||
{ category: 'pi_module_interface', count: 1, pct: 50 },
|
{ category: 'pi_module_interface', count: 1, pct: 50 },
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('tracks forced-native guard reasons', () => {
|
||||||
|
const events: AuditEvent[] = [
|
||||||
|
makeEvent(1000, 'backend.route', {
|
||||||
|
session_id: 's1',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '1',
|
||||||
|
selected_backend: 'native',
|
||||||
|
source: 'forced_native_guard',
|
||||||
|
guard_reason: 'pi_no_tools_mode',
|
||||||
|
}),
|
||||||
|
makeEvent(1010, 'session.message', {
|
||||||
|
session_id: 's1',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 10,
|
||||||
|
}),
|
||||||
|
makeEvent(2000, 'backend.route', {
|
||||||
|
session_id: 's2',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '2',
|
||||||
|
selected_backend: 'native',
|
||||||
|
source: 'forced_native_guard',
|
||||||
|
guard_reason: 'capability_query',
|
||||||
|
}),
|
||||||
|
makeEvent(2010, 'session.message', {
|
||||||
|
session_id: 's2',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 10,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const summary = summarizeBackendCanary(events, {
|
||||||
|
targetBackend: 'pi_embedded',
|
||||||
|
baselineBackend: 'native',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(summary.route_stats.forced_native_guards.pi_no_tools_mode).toBe(1);
|
||||||
|
expect(summary.route_stats.forced_native_guards.capability_query).toBe(1);
|
||||||
|
|
||||||
|
const markdown = renderBackendCanaryMarkdown(summary, {
|
||||||
|
targetBackend: 'pi_embedded',
|
||||||
|
baselineBackend: 'native',
|
||||||
|
});
|
||||||
|
expect(markdown).toContain('Forced Native Guards');
|
||||||
|
expect(markdown).toContain('pi_no_tools_mode');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('evaluateBackendCanaryGate', () => {
|
describe('evaluateBackendCanaryGate', () => {
|
||||||
@@ -322,4 +368,65 @@ describe('evaluateBackendCanaryGate', () => {
|
|||||||
actual: '1',
|
actual: '1',
|
||||||
}));
|
}));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('evaluates guard coverage thresholds', () => {
|
||||||
|
const events: AuditEvent[] = [
|
||||||
|
makeEvent(1000, 'backend.route', {
|
||||||
|
session_id: 's1',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '1',
|
||||||
|
selected_backend: 'native',
|
||||||
|
source: 'forced_native_guard',
|
||||||
|
guard_reason: 'pi_no_tools_mode',
|
||||||
|
}),
|
||||||
|
makeEvent(1010, 'session.message', {
|
||||||
|
session_id: 's1',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 20,
|
||||||
|
}),
|
||||||
|
makeEvent(2000, 'backend.route', {
|
||||||
|
session_id: 's2',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '2',
|
||||||
|
selected_backend: 'native',
|
||||||
|
source: 'forced_native_guard',
|
||||||
|
guard_reason: 'attachments_present',
|
||||||
|
}),
|
||||||
|
makeEvent(2010, 'session.message', {
|
||||||
|
session_id: 's2',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 20,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const summary = summarizeBackendCanary(events, {
|
||||||
|
targetBackend: 'pi_embedded',
|
||||||
|
baselineBackend: 'native',
|
||||||
|
});
|
||||||
|
|
||||||
|
const gate = evaluateBackendCanaryGate(summary, {
|
||||||
|
minGuardPiNoToolsCount: 1,
|
||||||
|
minGuardCapabilityQueryCount: 1,
|
||||||
|
minGuardAttachmentsPresentCount: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(gate.pass).toBe(false);
|
||||||
|
expect(gate.criteria).toEqual([
|
||||||
|
expect.objectContaining({
|
||||||
|
criterion: 'Minimum pi_no_tools_mode guard hits',
|
||||||
|
pass: true,
|
||||||
|
actual: '1',
|
||||||
|
}),
|
||||||
|
expect.objectContaining({
|
||||||
|
criterion: 'Minimum capability_query guard hits',
|
||||||
|
pass: false,
|
||||||
|
actual: '0',
|
||||||
|
}),
|
||||||
|
expect.objectContaining({
|
||||||
|
criterion: 'Minimum attachments_present guard hits',
|
||||||
|
pass: true,
|
||||||
|
actual: '1',
|
||||||
|
}),
|
||||||
|
]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -81,6 +81,9 @@ export interface BackendCanaryGateThresholds {
|
|||||||
minTargetRoutes?: number;
|
minTargetRoutes?: number;
|
||||||
minBaselineRoutes?: number;
|
minBaselineRoutes?: number;
|
||||||
minTargetAttempts?: number;
|
minTargetAttempts?: number;
|
||||||
|
minGuardPiNoToolsCount?: number;
|
||||||
|
minGuardCapabilityQueryCount?: number;
|
||||||
|
minGuardAttachmentsPresentCount?: number;
|
||||||
maxCompletionRateDropPp?: number;
|
maxCompletionRateDropPp?: number;
|
||||||
maxP50LatencyIncreaseMs?: number;
|
maxP50LatencyIncreaseMs?: number;
|
||||||
maxP95LatencyIncreaseMs?: number;
|
maxP95LatencyIncreaseMs?: number;
|
||||||
@@ -513,6 +516,36 @@ export function evaluateBackendCanaryGate(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (typeof thresholds.minGuardPiNoToolsCount === 'number') {
|
||||||
|
const count = summary.route_stats.forced_native_guards.pi_no_tools_mode ?? 0;
|
||||||
|
criteria.push({
|
||||||
|
criterion: 'Minimum pi_no_tools_mode guard hits',
|
||||||
|
pass: count >= thresholds.minGuardPiNoToolsCount,
|
||||||
|
actual: `${count}`,
|
||||||
|
threshold: `>= ${thresholds.minGuardPiNoToolsCount}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof thresholds.minGuardCapabilityQueryCount === 'number') {
|
||||||
|
const count = summary.route_stats.forced_native_guards.capability_query ?? 0;
|
||||||
|
criteria.push({
|
||||||
|
criterion: 'Minimum capability_query guard hits',
|
||||||
|
pass: count >= thresholds.minGuardCapabilityQueryCount,
|
||||||
|
actual: `${count}`,
|
||||||
|
threshold: `>= ${thresholds.minGuardCapabilityQueryCount}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof thresholds.minGuardAttachmentsPresentCount === 'number') {
|
||||||
|
const count = summary.route_stats.forced_native_guards.attachments_present ?? 0;
|
||||||
|
criteria.push({
|
||||||
|
criterion: 'Minimum attachments_present guard hits',
|
||||||
|
pass: count >= thresholds.minGuardAttachmentsPresentCount,
|
||||||
|
actual: `${count}`,
|
||||||
|
threshold: `>= ${thresholds.minGuardAttachmentsPresentCount}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
|
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
|
||||||
const delta = summary.comparison.completion_rate_delta_pp;
|
const delta = summary.comparison.completion_rate_delta_pp;
|
||||||
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);
|
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);
|
||||||
@@ -600,6 +633,20 @@ export function renderBackendCanaryMarkdown(
|
|||||||
}
|
}
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
||||||
|
lines.push('### Forced Native Guards');
|
||||||
|
lines.push('');
|
||||||
|
lines.push('| Guard reason | Count |');
|
||||||
|
lines.push('| --- | ---: |');
|
||||||
|
const guardRows = Object.entries(summary.route_stats.forced_native_guards)
|
||||||
|
.sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
|
||||||
|
for (const [reason, count] of guardRows) {
|
||||||
|
lines.push(`| ${reason} | ${count} |`);
|
||||||
|
}
|
||||||
|
if (guardRows.length === 0) {
|
||||||
|
lines.push('| _none_ | 0 |');
|
||||||
|
}
|
||||||
|
lines.push('');
|
||||||
|
|
||||||
lines.push('## Reliability');
|
lines.push('## Reliability');
|
||||||
lines.push('');
|
lines.push('');
|
||||||
lines.push('| Metric | Target | Baseline | Delta |');
|
lines.push('| Metric | Target | Baseline | Delta |');
|
||||||
|
|||||||
Reference in New Issue
Block a user