433 lines
13 KiB
TypeScript
433 lines
13 KiB
TypeScript
import { describe, expect, it } from 'vitest';
|
|
import type { AuditEvent } from './types.js';
|
|
import {
|
|
evaluateBackendCanaryGate,
|
|
renderBackendCanaryMarkdown,
|
|
summarizeBackendCanary,
|
|
} from './backendCanarySummary.js';
|
|
|
|
/**
 * Builds an `AuditEvent` fixture for the tests in this file.
 *
 * @param timestamp - Value stored as the event's `timestamp`.
 * @param event_type - Value stored as the event's `event_type`.
 * @param event - Payload stored on the event as-is (same object reference, not copied).
 * @param level - Value stored as the event's `level`. Defaults to `'info'`, so
 *   existing three-argument calls produce the same fixture as before.
 * @returns An object holding exactly `timestamp`, `level`, `event_type` and `event`.
 */
function makeEvent(
  timestamp: number,
  event_type: AuditEvent['event_type'],
  event: Record<string, unknown>,
  level: AuditEvent['level'] = 'info',
): AuditEvent {
  return { timestamp, level, event_type, event };
}
|
|
|
|
describe('summarizeBackendCanary', () => {
  // Target/baseline pair used by every scenario. It is spread at each call
  // site so the functions under test always receive a fresh options object.
  const BACKENDS = { targetBackend: 'pi_embedded', baselineBackend: 'native' } as const;

  // Identity fields shared by all events of the canary Telegram session.
  const CANARY = {
    session_id: 'telegram:canary',
    channel: 'telegram',
    sender: '8367012007',
  };

  /** Route event sending the canary session to `pi_embedded` via `agent_override`. */
  const canaryRoute = (timestamp: number): AuditEvent =>
    makeEvent(timestamp, 'backend.route', {
      ...CANARY,
      selected_backend: 'pi_embedded',
      source: 'agent_override',
    });

  /** Fallback event for the canary session, from `pi_embedded` to `native`. */
  const canaryFallback = (timestamp: number, reason: string, duration_ms: number): AuditEvent =>
    makeEvent(timestamp, 'backend.fallback', {
      ...CANARY,
      from_backend: 'pi_embedded',
      to_backend: 'native',
      reason,
      duration_ms,
    });

  /** Route event for a Telegram session whose backend and source are both `native`. */
  const nativeRoute = (timestamp: number, session_id: string, sender: string): AuditEvent =>
    makeEvent(timestamp, 'backend.route', {
      session_id,
      channel: 'telegram',
      sender,
      selected_backend: 'native',
      source: 'native',
    });

  /** Route event with source `forced_native_guard`, carrying the given guard reason. */
  const guardedRoute = (
    timestamp: number,
    session_id: string,
    sender: string,
    guard_reason: string,
  ): AuditEvent =>
    makeEvent(timestamp, 'backend.route', {
      session_id,
      channel: 'telegram',
      sender,
      selected_backend: 'native',
      source: 'forced_native_guard',
      guard_reason,
    });

  /** Assistant `session.message` event for the given session. */
  const assistantReply = (timestamp: number, session_id: string, content_length: number): AuditEvent =>
    makeEvent(timestamp, 'session.message', { session_id, role: 'assistant', content_length });

  it('computes route, reliability, latency, and fallback summaries', () => {
    const events: AuditEvent[] = [
      // Canary turn 1: routed to the target backend, which succeeds.
      canaryRoute(1000),
      makeEvent(1120, 'backend.success', {
        ...CANARY,
        backend: 'pi_embedded',
        duration_ms: 120,
        response_length: 50,
      }),
      assistantReply(1140, CANARY.session_id, 50),
      // Canary turn 2: routed to the target backend, which falls back to native.
      canaryRoute(2000),
      canaryFallback(2300, 'request timed out waiting for backend process', 300),
      assistantReply(2340, CANARY.session_id, 80),
      // Control turn: routed straight to the baseline backend.
      nativeRoute(3000, 'telegram:control', '123'),
      assistantReply(3080, 'telegram:control', 25),
    ];

    const summary = summarizeBackendCanary(events, { ...BACKENDS });
    const {
      route_stats: routeStats,
      target,
      baseline,
      target_external_attempts: externalAttempts,
      comparison,
    } = summary;

    // Routing totals.
    expect(routeStats.total).toBe(3);
    expect(routeStats.by_backend.pi_embedded).toBe(2);
    expect(routeStats.by_backend.native).toBe(1);

    // Target backend. NOTE(review): the latency figures are consistent with
    // interpolated percentiles over the route-to-assistant-reply gaps in the
    // fixture (140 ms and 340 ms) - confirm against the summarizer.
    expect(target.routes).toBe(2);
    expect(target.completed_turns).toBe(2);
    expect(target.completion_rate_pct).toBe(100);
    expect(target.e2e_latency_ms?.p50_ms).toBe(240);
    expect(target.e2e_latency_ms?.p95_ms).toBe(330);

    // Baseline backend (single control turn, 80 ms from route to reply).
    expect(baseline.routes).toBe(1);
    expect(baseline.completion_rate_pct).toBe(100);
    expect(baseline.e2e_latency_ms?.p50_ms).toBe(80);

    // External attempts against the target: one success, one fallback.
    expect(externalAttempts?.attempts).toBe(2);
    expect(externalAttempts?.successes).toBe(1);
    expect(externalAttempts?.fallbacks).toBe(1);
    expect(externalAttempts?.success_rate_pct).toBe(50);
    expect(externalAttempts?.attempt_latency_ms?.p50_ms).toBe(210);

    // Target-vs-baseline latency deltas.
    expect(comparison.p50_latency_delta_ms).toBe(160);
    expect(comparison.p95_latency_delta_ms).toBe(250);

    // Fallback breakdown.
    expect(summary.fallback_categories).toEqual([
      { category: 'timeout', count: 1, pct: 100 },
    ]);
    expect(summary.fallback_top_reasons[0]?.reason).toContain('request timed out');
  });

  it('filters routes by session id', () => {
    const events: AuditEvent[] = [
      canaryRoute(1000),
      assistantReply(1100, CANARY.session_id, 10),
      nativeRoute(2000, 'telegram:other', '9999'),
      assistantReply(2100, 'telegram:other', 10),
    ];

    // Only the canary session is listed, so the native route must not be counted.
    const summary = summarizeBackendCanary(events, {
      ...BACKENDS,
      sessionIds: ['telegram:canary'],
    });

    expect(summary.route_stats.total).toBe(1);
    expect(summary.target.routes).toBe(1);
    expect(summary.baseline.routes).toBe(0);
  });

  it('classifies pi-specific fallback reasons into stable categories', () => {
    const events: AuditEvent[] = [
      canaryRoute(1000),
      canaryFallback(
        1100,
        'Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession)',
        100,
      ),
      assistantReply(1200, CANARY.session_id, 20),
      canaryRoute(1300),
      canaryFallback(1400, 'Pi Agent runtime produced no assistant text', 120),
      assistantReply(1500, CANARY.session_id, 20),
    ];

    const summary = summarizeBackendCanary(events, { ...BACKENDS });

    // One fallback per category, each half of the total.
    expect(summary.fallback_categories).toEqual([
      { category: 'empty_assistant_text', count: 1, pct: 50 },
      { category: 'pi_module_interface', count: 1, pct: 50 },
    ]);
  });

  it('tracks forced-native guard reasons', () => {
    const events: AuditEvent[] = [
      guardedRoute(1000, 's1', '1', 'pi_no_tools_mode'),
      assistantReply(1010, 's1', 10),
      guardedRoute(2000, 's2', '2', 'capability_query'),
      assistantReply(2010, 's2', 10),
    ];

    const summary = summarizeBackendCanary(events, { ...BACKENDS });
    const guards = summary.route_stats.forced_native_guards;

    expect(guards.pi_no_tools_mode).toBe(1);
    expect(guards.capability_query).toBe(1);

    // The rendered report must surface the guard section and at least this reason.
    const markdown = renderBackendCanaryMarkdown(summary, { ...BACKENDS });
    expect(markdown).toContain('Forced Native Guards');
    expect(markdown).toContain('pi_no_tools_mode');
  });
});
|
|
|
|
describe('evaluateBackendCanaryGate', () => {
  // Target/baseline pair used by every scenario. It is spread at each call
  // site so the functions under test always receive a fresh options object.
  const BACKENDS = { targetBackend: 'pi_embedded', baselineBackend: 'native' } as const;

  /** Summarizes `events` with the shared target/baseline pair. */
  const summarize = (events: AuditEvent[]) => summarizeBackendCanary(events, { ...BACKENDS });

  /** Telegram `backend.route` event; `routing` supplies the fields after `sender`. */
  const routeOn = (
    timestamp: number,
    session_id: string,
    sender: string,
    routing: Record<string, unknown>,
  ): AuditEvent =>
    makeEvent(timestamp, 'backend.route', {
      session_id,
      channel: 'telegram',
      sender,
      ...routing,
    });

  /** `backend.success` event for session `s1` (sender `1`) on `pi_embedded`. */
  const piSuccess = (timestamp: number, duration_ms: number, response_length: number): AuditEvent =>
    makeEvent(timestamp, 'backend.success', {
      session_id: 's1',
      channel: 'telegram',
      sender: '1',
      backend: 'pi_embedded',
      duration_ms,
      response_length,
    });

  /** Assistant `session.message` event; every fixture here uses a content length of 20. */
  const assistantReply = (timestamp: number, session_id: string): AuditEvent =>
    makeEvent(timestamp, 'session.message', {
      session_id,
      role: 'assistant',
      content_length: 20,
    });

  // Trailing route fields reused across scenarios.
  const VIA_OVERRIDE = { selected_backend: 'pi_embedded', source: 'agent_override' };
  const VIA_NATIVE = { selected_backend: 'native', source: 'native' };
  const viaGuard = (guard_reason: string) => ({
    selected_backend: 'native',
    source: 'forced_native_guard',
    guard_reason,
  });

  it('evaluates configured pass/fail thresholds', () => {
    const events: AuditEvent[] = [
      // One successful target turn...
      routeOn(1000, 's1', '1', VIA_OVERRIDE),
      piSuccess(1200, 200, 10),
      assistantReply(1250, 's1'),
      // ...and one baseline turn.
      routeOn(2000, 's2', '2', VIA_NATIVE),
      assistantReply(2050, 's2'),
    ];

    const summary = summarize(events);
    const thresholds = {
      maxCompletionRateDropPp: 0,
      maxP50LatencyIncreaseMs: 300,
      maxP95LatencyIncreaseMs: 300,
      maxFallbackRatePct: 5,
    };
    const gate = evaluateBackendCanaryGate(summary, thresholds);

    // Four thresholds configured, four criteria reported, all satisfied.
    expect(gate.pass).toBe(true);
    expect(gate.criteria).toHaveLength(4);

    const markdown = renderBackendCanaryMarkdown(summary, { ...BACKENDS }, gate);
    expect(markdown).toContain('Pi Embedded Canary Summary');
    expect(markdown).toContain('Gate result: PASS');
  });

  it('fails gate when minimum sample thresholds are not met', () => {
    // A single target turn and no baseline traffic at all.
    const events: AuditEvent[] = [
      routeOn(1000, 's1', '1', VIA_OVERRIDE),
      piSuccess(1100, 100, 8),
      assistantReply(1200, 's1'),
    ];

    const gate = evaluateBackendCanaryGate(summarize(events), {
      minTargetRoutes: 3,
      minBaselineRoutes: 2,
      minTargetAttempts: 3,
      maxFallbackRatePct: 5,
    });

    expect(gate.pass).toBe(false);
    expect(gate.criteria).toHaveLength(4);

    // The first three criteria are the minimum-sample checks, each unmet.
    const unmet = [
      { criterion: 'Minimum target routes', actual: '1' },
      { criterion: 'Minimum baseline routes', actual: '0' },
      { criterion: 'Minimum target external attempts', actual: '1' },
    ];
    unmet.forEach(({ criterion, actual }, index) => {
      expect(gate.criteria[index]).toEqual(
        expect.objectContaining({ criterion, pass: false, actual }),
      );
    });
  });

  it('evaluates guard coverage thresholds', () => {
    // Two guarded routes; no `capability_query` guard appears in the fixture.
    const events: AuditEvent[] = [
      routeOn(1000, 's1', '1', viaGuard('pi_no_tools_mode')),
      assistantReply(1010, 's1'),
      routeOn(2000, 's2', '2', viaGuard('attachments_present')),
      assistantReply(2010, 's2'),
    ];

    const gate = evaluateBackendCanaryGate(summarize(events), {
      minGuardPiNoToolsCount: 1,
      minGuardCapabilityQueryCount: 1,
      minGuardAttachmentsPresentCount: 1,
    });

    expect(gate.pass).toBe(false);

    // Exactly one criterion per configured guard threshold, in this order.
    const coverage = [
      { criterion: 'Minimum pi_no_tools_mode guard hits', pass: true, actual: '1' },
      { criterion: 'Minimum capability_query guard hits', pass: false, actual: '0' },
      { criterion: 'Minimum attachments_present guard hits', pass: true, actual: '1' },
    ];
    expect(gate.criteria).toEqual(coverage.map((entry) => expect.objectContaining(entry)));
  });
});
|