feat(audit): add sample-size gate thresholds for canary evaluation
This commit is contained in:
@@ -149,6 +149,63 @@ describe('summarizeBackendCanary', () => {
|
||||
expect(summary.target.routes).toBe(1);
|
||||
expect(summary.baseline.routes).toBe(0);
|
||||
});
|
||||
|
||||
// Two pi_embedded -> native fallbacks with different free-form reason strings
// must each be reported under its own category, sharing the total 50/50.
it('classifies pi-specific fallback reasons into stable categories', () => {
  // Identity fields repeated on every route/fallback event of this conversation.
  const origin = {
    session_id: 'telegram:canary',
    channel: 'telegram',
    sender: '8367012007',
  } as const;

  const events: AuditEvent[] = [
    // First turn: routed to Pi, falls back because of the module's interface.
    makeEvent(1000, 'backend.route', {
      ...origin,
      selected_backend: 'pi_embedded',
      source: 'agent_override',
    }),
    makeEvent(1100, 'backend.fallback', {
      ...origin,
      from_backend: 'pi_embedded',
      to_backend: 'native',
      reason: 'Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession)',
      duration_ms: 100,
    }),
    makeEvent(1200, 'session.message', {
      session_id: origin.session_id,
      role: 'assistant',
      content_length: 20,
    }),
    // Second turn: routed to Pi again, falls back because no text came back.
    makeEvent(1300, 'backend.route', {
      ...origin,
      selected_backend: 'pi_embedded',
      source: 'agent_override',
    }),
    makeEvent(1400, 'backend.fallback', {
      ...origin,
      from_backend: 'pi_embedded',
      to_backend: 'native',
      reason: 'Pi Agent runtime produced no assistant text',
      duration_ms: 120,
    }),
    makeEvent(1500, 'session.message', {
      session_id: origin.session_id,
      role: 'assistant',
      content_length: 20,
    }),
  ];

  const canaryOptions = {
    targetBackend: 'pi_embedded',
    baselineBackend: 'native',
  };
  const summary = summarizeBackendCanary(events, canaryOptions);

  // The asserted order is part of the expectation: toEqual compares arrays positionally.
  const expectedCategories = [
    { category: 'empty_assistant_text', count: 1, pct: 50 },
    { category: 'pi_module_interface', count: 1, pct: 50 },
  ];
  expect(summary.fallback_categories).toEqual(expectedCategories);
});
|
||||
});
|
||||
|
||||
describe('evaluateBackendCanaryGate', () => {
|
||||
@@ -210,4 +267,59 @@ describe('evaluateBackendCanaryGate', () => {
|
||||
expect(markdown).toContain('Pi Embedded Canary Summary');
|
||||
expect(markdown).toContain('Gate result: PASS');
|
||||
});
|
||||
|
||||
// A single successful pi_embedded turn is evaluated against minimums of
// 3 target routes, 2 baseline routes and 3 target attempts; the gate must fail
// and report the observed counts for each of those three criteria.
it('fails gate when minimum sample thresholds are not met', () => {
  // Identity fields shared by the route and success events.
  const peer = { session_id: 's1', channel: 'telegram', sender: '1' } as const;

  const events: AuditEvent[] = [
    makeEvent(1000, 'backend.route', {
      ...peer,
      selected_backend: 'pi_embedded',
      source: 'agent_override',
    }),
    makeEvent(1100, 'backend.success', {
      ...peer,
      backend: 'pi_embedded',
      duration_ms: 100,
      response_length: 8,
    }),
    makeEvent(1200, 'session.message', {
      session_id: peer.session_id,
      role: 'assistant',
      content_length: 20,
    }),
  ];

  const canaryOptions = {
    targetBackend: 'pi_embedded',
    baselineBackend: 'native',
  };
  const summary = summarizeBackendCanary(events, canaryOptions);

  const thresholds = {
    minTargetRoutes: 3,
    minBaselineRoutes: 2,
    minTargetAttempts: 3,
    maxFallbackRatePct: 5,
  };
  const gate = evaluateBackendCanaryGate(summary, thresholds);

  expect(gate.pass).toBe(false);
  // Four criteria are reported; only the first three are inspected below.
  expect(gate.criteria).toHaveLength(4);

  // [criterion label, reported actual value], in the order the gate lists them.
  const expectedShortfalls = [
    ['Minimum target routes', '1'],
    ['Minimum baseline routes', '0'],
    ['Minimum target external attempts', '1'],
  ] as const;

  expectedShortfalls.forEach(([criterion, actual], index) => {
    expect(gate.criteria[index]).toEqual(
      expect.objectContaining({ criterion, pass: false, actual }),
    );
  });
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user