feat(audit): add pi canary summary analyzer and cli script
This commit is contained in:
+2
-1
@@ -19,7 +19,8 @@
|
||||
"lint": "eslint src/",
|
||||
"typecheck": "tsc --noEmit",
|
||||
"config:profiles:generate": "node scripts/generate-config-profiles.mjs",
|
||||
"config:profiles:check": "node scripts/generate-config-profiles.mjs --check"
|
||||
"config:profiles:check": "node scripts/generate-config-profiles.mjs --check",
|
||||
"audit:backend-canary": "node --import tsx/esm scripts/summarize-backend-canary.ts"
|
||||
},
|
||||
"keywords": [
|
||||
"ai",
|
||||
|
||||
Executable
+208
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { writeFile } from 'node:fs/promises';
|
||||
import { parseArgs } from 'node:util';
|
||||
import { queryAuditLogs } from '../src/audit/export.js';
|
||||
import {
|
||||
evaluateBackendCanaryGate,
|
||||
renderBackendCanaryMarkdown,
|
||||
summarizeBackendCanary,
|
||||
type BackendCanaryGateThresholds,
|
||||
type BackendCanarySummaryOptions,
|
||||
type BackendRouteSource,
|
||||
type RoutedBackendName,
|
||||
} from '../src/audit/backendCanarySummary.js';
|
||||
|
||||
/** Audit event types requested from the log; main() passes a copy as `event_types` to queryAuditLogs. */
const DEFAULT_EVENT_TYPES = ['backend.route', 'backend.success', 'backend.fallback', 'session.message'] as const;
|
||||
|
||||
/**
 * Builds the command-line help text for the backend canary summary script.
 *
 * @returns The synopsis, the general options, and the optional gate options,
 *   joined with newlines and without a trailing newline.
 */
function usage(): string {
  const synopsis = [
    'Usage: node --import tsx/esm scripts/summarize-backend-canary.ts --audit <path> [options]',
    '',
  ];

  const generalOptions = [
    'Options:',
    ' --audit <path> Path to audit.log (required)',
    ' --backend <name> Target backend (default: pi_embedded)',
    ' --baseline <name> Baseline backend (default: native)',
    ' --since <ISO-8601|epoch_ms> Start time filter',
    ' --until <ISO-8601|epoch_ms> End time filter',
    ' --session <id[,id...]> Restrict to session IDs',
    ' --channel <name[,name...]> Restrict to channels',
    ' --sender <id[,id...]> Restrict to senders',
    ' --source <name[,name...]> Restrict route sources (agent_override,default_external,native,forced_native_guard)',
    ' --format <markdown|json> Output format (default: markdown)',
    ' --out <path> Write output to file instead of stdout',
  ];

  const gateOptions = [
    '',
    'Gate options (optional):',
    ' --gate-max-completion-drop-pp <number>',
    ' --gate-max-p50-latency-increase-ms <number>',
    ' --gate-max-p95-latency-increase-ms <number>',
    ' --gate-max-fallback-rate-pct <number>',
  ];

  return [...synopsis, ...generalOptions, ...gateOptions].join('\n');
}
|
||||
|
||||
/**
 * Converts a time-filter argument into a millisecond timestamp.
 *
 * A value made only of decimal digits is taken as epoch milliseconds;
 * anything else is handed to `Date.parse`.
 *
 * @param value Raw flag value; `undefined` or an empty string means "not set".
 * @param flag Flag name used in the error message (e.g. `--since`).
 * @returns The timestamp in milliseconds, or `undefined` when the flag is not set.
 * @throws Error when the value is neither a usable digit string nor parseable by `Date.parse`.
 */
function parseTime(value: string | undefined, flag: string): number | undefined {
  if (!value) {
    return undefined;
  }

  // Digit-only input that overflows to Infinity falls through to Date.parse below.
  const epochMs = /^\d+$/.test(value) ? Number(value) : Number.NaN;
  if (Number.isFinite(epochMs)) {
    return epochMs;
  }

  const fromDateString = Date.parse(value);
  if (Number.isFinite(fromDateString)) {
    return fromDateString;
  }

  throw new Error(`Invalid ${flag} value "${value}". Use ISO-8601 or epoch milliseconds.`);
}
|
||||
|
||||
/**
 * Splits a comma-separated flag value into trimmed, non-empty entries.
 *
 * @param value Raw flag value; `undefined` or an empty string means "not set".
 * @returns The entries in their original order, or `undefined` when the flag
 *   is not set or contains no non-blank entry.
 */
function parseCsv(value: string | undefined): string[] | undefined {
  if (!value) {
    return undefined;
  }

  const entries: string[] = [];
  for (const piece of value.split(',')) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }

  return entries.length === 0 ? undefined : entries;
}
|
||||
|
||||
/**
 * Parses an optional numeric flag value.
 *
 * @param raw Raw flag value; `undefined` or an empty string means "not set".
 * @param flag Flag name used in the error message (e.g. `--gate-max-fallback-rate-pct`).
 * @returns The parsed finite number, or `undefined` when the flag is not set.
 * @throws Error when the value is blank (whitespace only) or does not convert
 *   to a finite number.
 */
function parseOptionalNumber(raw: string | undefined, flag: string): number | undefined {
  if (!raw) {
    return undefined;
  }

  // `Number('   ')` evaluates to 0, so a whitespace-only value must be rejected
  // explicitly; otherwise a blank argument would silently become a threshold of 0.
  const trimmed = raw.trim();
  const parsed = trimmed.length > 0 ? Number(trimmed) : Number.NaN;
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid ${flag} value "${raw}". Expected a number.`);
  }
  return parsed;
}
|
||||
|
||||
/**
 * Validates a backend name supplied on the command line.
 *
 * @param raw Raw flag value; when `undefined`, `fallback` is used instead.
 * @param fallback Backend name applied when the flag is absent.
 * @returns The trimmed backend name.
 * @throws Error when the trimmed name is not one of the recognised backends.
 */
function parseBackendName(raw: string | undefined, fallback: RoutedBackendName): RoutedBackendName {
  const candidate: string = (raw ?? fallback).trim();

  switch (candidate) {
    case 'native':
    case 'claude_code':
    case 'opencode':
    case 'codex':
    case 'gemini':
    case 'pi_embedded':
      return candidate as RoutedBackendName;
    default:
      throw new Error(`Invalid backend "${candidate}".`);
  }
}
|
||||
|
||||
/**
 * Parses the comma-separated `--source` flag into validated route sources.
 *
 * @param raw Raw flag value, split by `parseCsv`.
 * @returns The route sources in input order, or `undefined` when `parseCsv`
 *   yields no entries.
 * @throws Error on the first entry that is not a recognised route source.
 */
function parseSources(raw: string | undefined): BackendRouteSource[] | undefined {
  const names = parseCsv(raw);
  if (names === undefined) {
    return undefined;
  }

  return names.map((name): BackendRouteSource => {
    switch (name) {
      case 'agent_override':
      case 'default_external':
      case 'native':
      case 'forced_native_guard':
        return name;
      default:
        throw new Error(`Invalid source "${name}".`);
    }
  });
}
|
||||
|
||||
/**
 * CLI entry point: parses the flags, loads the matching audit events,
 * summarizes them, optionally evaluates the gate thresholds, and writes the
 * report as markdown or JSON to `--out` or stdout.
 *
 * @throws Error when `--audit` is missing or any flag value is invalid; errors
 *   from `parseArgs`, `queryAuditLogs`, and `writeFile` propagate unchanged.
 */
async function main(): Promise<void> {
  type GateFlag =
    | 'gate-max-completion-drop-pp'
    | 'gate-max-p50-latency-increase-ms'
    | 'gate-max-p95-latency-increase-ms'
    | 'gate-max-fallback-rate-pct';

  const { values: args } = parseArgs({
    options: {
      audit: { type: 'string' },
      backend: { type: 'string' },
      baseline: { type: 'string' },
      since: { type: 'string' },
      until: { type: 'string' },
      session: { type: 'string' },
      channel: { type: 'string' },
      sender: { type: 'string' },
      source: { type: 'string' },
      format: { type: 'string' },
      out: { type: 'string' },
      'gate-max-completion-drop-pp': { type: 'string' },
      'gate-max-p50-latency-increase-ms': { type: 'string' },
      'gate-max-p95-latency-increase-ms': { type: 'string' },
      'gate-max-fallback-rate-pct': { type: 'string' },
      help: { type: 'boolean', short: 'h' },
    },
    strict: true,
    allowPositionals: false,
  });

  if (args.help) {
    process.stdout.write(`${usage()}\n`);
    return;
  }

  const auditPath = args.audit;
  if (!auditPath) {
    throw new Error('--audit is required.');
  }

  const format = args.format ?? 'markdown';
  if (!(format === 'markdown' || format === 'json')) {
    throw new Error(`Invalid --format value "${format}".`);
  }

  const summaryOptions: BackendCanarySummaryOptions = {
    targetBackend: parseBackendName(args.backend, 'pi_embedded'),
    baselineBackend: parseBackendName(args.baseline, 'native'),
    sessionIds: parseCsv(args.session),
    channels: parseCsv(args.channel),
    senders: parseCsv(args.sender),
    routeSources: parseSources(args.source),
  };

  const sinceMs = parseTime(args.since, '--since');
  const untilMs = parseTime(args.until, '--until');

  const events = await queryAuditLogs(auditPath, {
    start_time: sinceMs,
    end_time: untilMs,
    event_types: [...DEFAULT_EVENT_TYPES],
  });
  const summary = summarizeBackendCanary(events, summaryOptions);

  // Gate flags are read after the summary is built, one per threshold.
  const readGateFlag = (flag: GateFlag): number | undefined => parseOptionalNumber(args[flag], `--${flag}`);
  const gateThresholds: BackendCanaryGateThresholds = {
    maxCompletionRateDropPp: readGateFlag('gate-max-completion-drop-pp'),
    maxP50LatencyIncreaseMs: readGateFlag('gate-max-p50-latency-increase-ms'),
    maxP95LatencyIncreaseMs: readGateFlag('gate-max-p95-latency-increase-ms'),
    maxFallbackRatePct: readGateFlag('gate-max-fallback-rate-pct'),
  };

  // The gate is evaluated only when at least one threshold was supplied.
  const anyThresholdSet = Object.values(gateThresholds).some((threshold) => typeof threshold === 'number');
  const gateResult = anyThresholdSet ? evaluateBackendCanaryGate(summary, gateThresholds) : undefined;

  let output: string;
  if (format === 'json') {
    output = JSON.stringify({
      generated_at: new Date().toISOString(),
      event_count: events.length,
      filters: {
        since_ms: sinceMs,
        until_ms: untilMs,
      },
      options: summaryOptions,
      summary,
      gate: gateResult,
    }, null, 2);
  } else {
    output = renderBackendCanaryMarkdown(summary, summaryOptions, gateResult);
  }

  const text = `${output}\n`;
  if (args.out) {
    await writeFile(args.out, text, 'utf-8');
  } else {
    process.stdout.write(text);
  }
}
|
||||
|
||||
/**
 * Prints the failure message followed by the usage text to stderr and marks
 * the process as failed via `process.exitCode`.
 */
function reportFatalError(error: unknown): void {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`${message}\n\n${usage()}\n`);
  process.exitCode = 1;
}

main().catch(reportFatalError);
|
||||
@@ -0,0 +1,213 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { AuditEvent } from './types.js';
|
||||
import {
|
||||
evaluateBackendCanaryGate,
|
||||
renderBackendCanaryMarkdown,
|
||||
summarizeBackendCanary,
|
||||
} from './backendCanarySummary.js';
|
||||
|
||||
/**
 * Test helper that wraps a payload in an `info`-level audit event.
 *
 * @param timestamp Event timestamp.
 * @param event_type Audit event type.
 * @param event Event payload stored under `event`.
 * @returns The assembled audit event.
 */
function makeEvent(
  timestamp: number,
  event_type: AuditEvent['event_type'],
  event: Record<string, unknown>,
): AuditEvent {
  const auditEvent: AuditEvent = {
    timestamp: timestamp,
    level: 'info',
    event_type: event_type,
    event: event,
  };
  return auditEvent;
}
|
||||
|
||||
describe('summarizeBackendCanary', () => {
  // Every case compares the pi_embedded canary against the native baseline.
  const piVersusNative = { targetBackend: 'pi_embedded', baselineBackend: 'native' } as const;

  // Identity fields shared by all events of the canary session.
  const canaryUser = { session_id: 'telegram:canary', channel: 'telegram', sender: '8367012007' };

  // Route event that sends the canary session to pi_embedded via an agent override.
  const canaryRoute = (timestamp: number): AuditEvent => makeEvent(timestamp, 'backend.route', {
    ...canaryUser,
    selected_backend: 'pi_embedded',
    source: 'agent_override',
  });

  // Route event that keeps a session on the native backend.
  const nativeRoute = (timestamp: number, sessionId: string, sender: string): AuditEvent => (
    makeEvent(timestamp, 'backend.route', {
      session_id: sessionId,
      channel: 'telegram',
      sender,
      selected_backend: 'native',
      source: 'native',
    })
  );

  // Assistant message that completes the oldest pending turn of a session.
  const assistantReply = (timestamp: number, sessionId: string, contentLength: number): AuditEvent => (
    makeEvent(timestamp, 'session.message', {
      session_id: sessionId,
      role: 'assistant',
      content_length: contentLength,
    })
  );

  it('computes route, reliability, latency, and fallback summaries', () => {
    const events: AuditEvent[] = [
      // Canary turn 1: external success, reply 140ms after routing.
      canaryRoute(1000),
      makeEvent(1120, 'backend.success', {
        ...canaryUser,
        backend: 'pi_embedded',
        duration_ms: 120,
        response_length: 50,
      }),
      assistantReply(1140, 'telegram:canary', 50),
      // Canary turn 2: external fallback to native, reply 340ms after routing.
      canaryRoute(2000),
      makeEvent(2300, 'backend.fallback', {
        ...canaryUser,
        from_backend: 'pi_embedded',
        to_backend: 'native',
        reason: 'request timed out waiting for backend process',
        duration_ms: 300,
      }),
      assistantReply(2340, 'telegram:canary', 80),
      // Control turn on native, reply 80ms after routing.
      nativeRoute(3000, 'telegram:control', '123'),
      assistantReply(3080, 'telegram:control', 25),
    ];

    const summary = summarizeBackendCanary(events, { ...piVersusNative });
    const { route_stats: routes, target, baseline, comparison } = summary;
    const attempts = summary.target_external_attempts;

    expect(routes.total).toBe(3);
    expect(routes.by_backend.pi_embedded).toBe(2);
    expect(routes.by_backend.native).toBe(1);

    expect(target.routes).toBe(2);
    expect(target.completed_turns).toBe(2);
    expect(target.completion_rate_pct).toBe(100);
    expect(target.e2e_latency_ms?.p50_ms).toBe(240);
    expect(target.e2e_latency_ms?.p95_ms).toBe(330);

    expect(baseline.routes).toBe(1);
    expect(baseline.completion_rate_pct).toBe(100);
    expect(baseline.e2e_latency_ms?.p50_ms).toBe(80);

    expect(attempts?.attempts).toBe(2);
    expect(attempts?.successes).toBe(1);
    expect(attempts?.fallbacks).toBe(1);
    expect(attempts?.success_rate_pct).toBe(50);
    expect(attempts?.attempt_latency_ms?.p50_ms).toBe(210);

    expect(comparison.p50_latency_delta_ms).toBe(160);
    expect(comparison.p95_latency_delta_ms).toBe(250);

    expect(summary.fallback_categories).toEqual([
      { category: 'timeout', count: 1, pct: 100 },
    ]);
    expect(summary.fallback_top_reasons[0]?.reason).toContain('request timed out');
  });

  it('filters routes by session id', () => {
    const events: AuditEvent[] = [
      canaryRoute(1000),
      assistantReply(1100, 'telegram:canary', 10),
      nativeRoute(2000, 'telegram:other', '9999'),
      assistantReply(2100, 'telegram:other', 10),
    ];

    const summary = summarizeBackendCanary(events, {
      ...piVersusNative,
      sessionIds: ['telegram:canary'],
    });

    expect(summary.route_stats.total).toBe(1);
    expect(summary.target.routes).toBe(1);
    expect(summary.baseline.routes).toBe(0);
  });
});
|
||||
|
||||
describe('evaluateBackendCanaryGate', () => {
  it('evaluates configured pass/fail thresholds', () => {
    const summaryOptions = { targetBackend: 'pi_embedded', baselineBackend: 'native' } as const;

    // Assistant message that completes the pending turn of a session.
    const assistantReply = (timestamp: number, sessionId: string): AuditEvent => (
      makeEvent(timestamp, 'session.message', {
        session_id: sessionId,
        role: 'assistant',
        content_length: 20,
      })
    );

    // One successful pi_embedded turn (s1) and one native turn (s2).
    const piSession = { session_id: 's1', channel: 'telegram', sender: '1' };
    const events: AuditEvent[] = [
      makeEvent(1000, 'backend.route', {
        ...piSession,
        selected_backend: 'pi_embedded',
        source: 'agent_override',
      }),
      makeEvent(1200, 'backend.success', {
        ...piSession,
        backend: 'pi_embedded',
        duration_ms: 200,
        response_length: 10,
      }),
      assistantReply(1250, 's1'),
      makeEvent(2000, 'backend.route', {
        session_id: 's2',
        channel: 'telegram',
        sender: '2',
        selected_backend: 'native',
        source: 'native',
      }),
      assistantReply(2050, 's2'),
    ];

    const summary = summarizeBackendCanary(events, { ...summaryOptions });

    const gate = evaluateBackendCanaryGate(summary, {
      maxCompletionRateDropPp: 0,
      maxP50LatencyIncreaseMs: 300,
      maxP95LatencyIncreaseMs: 300,
      maxFallbackRatePct: 5,
    });

    expect(gate.pass).toBe(true);
    expect(gate.criteria).toHaveLength(4);

    const markdown = renderBackendCanaryMarkdown(summary, { ...summaryOptions }, gate);
    expect(markdown).toContain('Pi Embedded Canary Summary');
    expect(markdown).toContain('Gate result: PASS');
  });
});
|
||||
@@ -0,0 +1,618 @@
|
||||
import type { ExternalBackendName } from '../backends/external.js';
|
||||
import type { AuditEvent } from './types.js';
|
||||
|
||||
/** A backend a turn can be routed to: the built-in `native` backend or any external backend. */
export type RoutedBackendName = 'native' | ExternalBackendName;

/** Values accepted for the `source` field of a `backend.route` audit event. */
export type BackendRouteSource =
  | 'agent_override'
  | 'default_external'
  | 'native'
  | 'forced_native_guard';

/** Values accepted for the `guard_reason` field of a `backend.route` audit event. */
export type BackendGuardReason =
  | 'capability_query'
  | 'pi_no_tools_mode'
  | 'attachments_present';

/** Inputs for `summarizeBackendCanary`. */
export interface BackendCanarySummaryOptions {
  /** Backend whose turns are aggregated into `summary.target`. */
  targetBackend: RoutedBackendName;
  /** Backend whose turns are aggregated into `summary.baseline`. */
  baselineBackend: RoutedBackendName;
  /** When non-empty, only route events with one of these session IDs are counted. */
  sessionIds?: string[];
  /** When non-empty, only route events on one of these channels are counted. */
  channels?: string[];
  /** When non-empty, only route events from one of these senders are counted. */
  senders?: string[];
  /** When non-empty, only route events with one of these sources are counted. */
  routeSources?: BackendRouteSource[];
}
|
||||
|
||||
/** Descriptive statistics over a set of latency samples. */
export interface LatencyStats {
  /** Number of samples. */
  count: number;
  /** Mean of the samples, rounded to the nearest integer. */
  avg_ms: number;
  /** Linearly interpolated 50th percentile, rounded to the nearest integer. */
  p50_ms: number;
  /** Linearly interpolated 95th percentile, rounded to the nearest integer. */
  p95_ms: number;
  /** Smallest sample. */
  min_ms: number;
  /** Largest sample. */
  max_ms: number;
}

/** Counts of accepted `backend.route` events (valid payload, passing every filter). */
export interface RouteStats {
  /** Number of accepted route events. */
  total: number;
  /** Accepted route events per selected backend. */
  by_backend: Partial<Record<RoutedBackendName, number>>;
  /** Accepted route events per route source. */
  by_source: Partial<Record<BackendRouteSource, number>>;
  /** Accepted route events per recognised `guard_reason`. */
  forced_native_guards: Partial<Record<BackendGuardReason, number>>;
}

/** Per-backend turn completion and end-to-end latency figures. */
export interface BackendStats {
  /** Backend these figures describe. */
  backend: RoutedBackendName;
  /** Number of turns routed to the backend. */
  routes: number;
  /** Turns that were matched with an assistant `session.message`. */
  completed_turns: number;
  /** `routes` minus `completed_turns`. */
  incomplete_turns: number;
  /** Completed turns as a percentage of routes (two decimals); `null` when there are no routes. */
  completion_rate_pct: number | null;
  /** Latency from the route event to the assistant message; `null` when there are no samples. */
  e2e_latency_ms: LatencyStats | null;
}

/** Outcome figures for the target backend's external attempts. */
export interface ExternalBackendAttemptStats {
  /** Turns that recorded a `backend.success` or `backend.fallback` outcome. */
  attempts: number;
  /** Attempts that ended with `backend.success`. */
  successes: number;
  /** Attempts that ended with `backend.fallback`. */
  fallbacks: number;
  /** Turns with no recorded outcome. */
  unresolved_attempts: number;
  /** Successes as a percentage of attempts (two decimals); `null` when there are no attempts. */
  success_rate_pct: number | null;
  /** Statistics over the `duration_ms` values reported with the outcomes; `null` when there are none. */
  attempt_latency_ms: LatencyStats | null;
}

/** Number of fallbacks that fell into one normalized category. */
export interface FallbackReasonStats {
  /** Normalized category name. */
  category: string;
  /** Fallbacks in this category. */
  count: number;
  /** Share of all counted fallbacks, as a percentage with two decimals. */
  pct: number;
}

/** Number of fallbacks that reported one specific (whitespace-collapsed, truncated) reason. */
export interface FallbackRawReasonStats {
  /** Reason text as prepared for display. */
  reason: string;
  /** Fallbacks reporting this reason. */
  count: number;
}

/** Target-minus-baseline differences; each is `null` when either side has no value. */
export interface BackendComparisonStats {
  /** Completion-rate difference in percentage points. */
  completion_rate_delta_pp: number | null;
  /** Difference of the p50 end-to-end latencies. */
  p50_latency_delta_ms: number | null;
  /** Difference of the p95 end-to-end latencies. */
  p95_latency_delta_ms: number | null;
}

/** Result of `summarizeBackendCanary`. */
export interface BackendCanarySummary {
  /** Counts over all accepted route events. */
  route_stats: RouteStats;
  /** Figures for the target backend. */
  target: BackendStats;
  /** Figures for the baseline backend. */
  baseline: BackendStats;
  /** External attempt outcomes for the target backend; `null` when the target is `native`. */
  target_external_attempts: ExternalBackendAttemptStats | null;
  /** Target-versus-baseline deltas. */
  comparison: BackendComparisonStats;
  /** Fallback counts per category, most frequent first. */
  fallback_categories: FallbackReasonStats[];
  /** Most frequent fallback reasons, most frequent first. */
  fallback_top_reasons: FallbackRawReasonStats[];
}
|
||||
|
||||
/**
 * Optional limits checked by `evaluateBackendCanaryGate`. A criterion is
 * evaluated only when its threshold is a number.
 */
export interface BackendCanaryGateThresholds {
  /** Largest tolerated completion-rate drop (target below baseline), in percentage points. */
  maxCompletionRateDropPp?: number;
  /** Largest tolerated p50 latency delta (target - baseline), in milliseconds. */
  maxP50LatencyIncreaseMs?: number;
  /** Largest tolerated p95 latency delta (target - baseline), in milliseconds. */
  maxP95LatencyIncreaseMs?: number;
  /** Largest tolerated share of target attempts that ended in a fallback, as a percentage. */
  maxFallbackRatePct?: number;
}

/** Outcome of a gate evaluation. */
export interface BackendCanaryGateResult {
  /** Overall verdict. NOTE(review): presumably true only when every criterion passes; confirm in evaluateBackendCanaryGate. */
  pass: boolean;
  /** One entry per evaluated criterion, with the measured value and the limit as display strings. */
  criteria: {
    criterion: string;
    pass: boolean;
    actual: string;
    threshold: string;
  }[];
}
|
||||
|
||||
/**
 * Working record for one accepted `backend.route` event and the later events
 * that `summarizeBackendCanary` attaches to it.
 */
interface RouteTurn {
  /** Session the turn belongs to. */
  sessionId: string;
  /** Channel reported by the route event. */
  channel: string;
  /** Sender reported by the route event. */
  sender: string;
  /** Backend chosen by the route event. */
  selectedBackend: RoutedBackendName;
  /** Route source reported by the route event. */
  source: BackendRouteSource;
  /** Guard reason, when the route event carried a recognised one. */
  guardReason?: BackendGuardReason;
  /** Timestamp of the route event. */
  routeTimestamp: number;
  /** Set when a matching `backend.success` or `backend.fallback` event is found. */
  externalOutcome?: 'success' | 'fallback';
  /** `duration_ms` of the matching outcome event, when it is a finite number. */
  externalDurationMs?: number;
  /** Reason text of the matching fallback event. */
  fallbackReason?: string;
  /** Timestamp of the assistant message that completed the turn. */
  assistantTimestamp?: number;
}
|
||||
|
||||
/**
 * Views an unknown value as a string-keyed record.
 *
 * @returns The same object when `value` is a non-null object (arrays included),
 *   otherwise a fresh empty object.
 */
function toRecord(value: unknown): Record<string, unknown> {
  if (typeof value === 'object' && value !== null) {
    return value as Record<string, unknown>;
  }
  return {};
}
|
||||
|
||||
/** Returns `value` when it is a string, otherwise `undefined`. */
function readString(value: unknown): string | undefined {
  if (typeof value !== 'string') {
    return undefined;
  }
  return value;
}
|
||||
|
||||
/** Returns `value` when it is a finite number, otherwise `undefined`. */
function readNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
|
||||
|
||||
/** Type guard: true when `value` is one of the recognised route sources. */
function isRouteSource(value: unknown): value is BackendRouteSource {
  switch (value) {
    case 'agent_override':
    case 'default_external':
    case 'native':
    case 'forced_native_guard':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/** Type guard: true when `value` is one of the recognised guard reasons. */
function isGuardReason(value: unknown): value is BackendGuardReason {
  switch (value) {
    case 'capability_query':
    case 'pi_no_tools_mode':
    case 'attachments_present':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/** Type guard: true when `value` is one of the recognised backend names. */
function isRoutedBackendName(value: unknown): value is RoutedBackendName {
  switch (value) {
    case 'native':
    case 'claude_code':
    case 'opencode':
    case 'codex':
    case 'gemini':
    case 'pi_embedded':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Computes a percentile by linear interpolation between the two nearest ranks.
 *
 * @param sortedAscending Samples, already sorted in ascending order.
 * @param pct Requested percentile; clamped to the range 0-100.
 * @returns The interpolated value, `0` for an empty array, or the only element
 *   of a single-element array.
 */
function percentile(sortedAscending: number[], pct: number): number {
  const size = sortedAscending.length;
  if (size === 0) {
    return 0;
  }
  if (size === 1) {
    return sortedAscending[0];
  }

  const boundedPct = Math.min(100, Math.max(0, pct));
  // Fractional index into the sorted samples.
  const rank = (boundedPct / 100) * (size - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);

  const lowerValue = sortedAscending[below] ?? 0;
  if (below === above) {
    return lowerValue;
  }

  const upperValue = sortedAscending[above] ?? 0;
  return lowerValue + ((upperValue - lowerValue) * (rank - below));
}
|
||||
|
||||
/**
 * Summarizes latency samples.
 *
 * @param samples Latency samples in any order; the array is not modified.
 * @returns Count, rounded mean, rounded p50/p95, minimum and maximum, or
 *   `null` when there are no samples.
 */
function computeLatencyStats(samples: number[]): LatencyStats | null {
  const count = samples.length;
  if (count === 0) {
    return null;
  }

  // Sort a copy so the caller's array keeps its order.
  const ascending = samples.slice().sort((left, right) => left - right);
  let sum = 0;
  for (const sample of ascending) {
    sum += sample;
  }

  return {
    count,
    avg_ms: Math.round(sum / count),
    p50_ms: Math.round(percentile(ascending, 50)),
    p95_ms: Math.round(percentile(ascending, 95)),
    min_ms: ascending[0] ?? 0,
    max_ms: ascending[count - 1] ?? 0,
  };
}
|
||||
|
||||
/**
 * Expresses `part / whole` as a percentage rounded to two decimals.
 *
 * @returns The percentage, or `null` when `whole` is zero or negative.
 */
function toPct(part: number, whole: number): number | null {
  return whole <= 0 ? null : Math.round((part / whole) * 10000) / 100;
}
|
||||
|
||||
/**
 * Category rules in priority order: the first rule with a keyword contained
 * in the lower-cased reason wins.
 */
const FALLBACK_CATEGORY_RULES: ReadonlyArray<readonly [string, readonly string[]]> = [
  ['timeout', ['timeout', 'timed out']],
  ['cancelled', ['abort', 'cancel']],
  ['rate_limit', ['rate limit', '429']],
  ['auth', ['unauthorized', 'forbidden', '401', '403', 'api key']],
  ['network', ['fetch failed', 'network', 'socket', 'econn', 'enotfound', 'connect']],
  ['tool_or_capability', ['tool', 'capability']],
  ['response_format', ['json', 'parse', 'format']],
];

/**
 * Maps a free-form fallback reason to a coarse category by keyword matching.
 *
 * @param reason Fallback reason text.
 * @returns `'unknown'` for a blank reason, the first matching category, or the
 *   trimmed lower-cased reason cut to 80 characters when no keyword matches.
 */
function normalizeFallbackCategory(reason: string): string {
  const lowered = reason.trim().toLowerCase();
  if (lowered.length === 0) {
    return 'unknown';
  }

  for (const [category, keywords] of FALLBACK_CATEGORY_RULES) {
    if (keywords.some((keyword) => lowered.includes(keyword))) {
      return category;
    }
  }

  return lowered.slice(0, 80);
}
|
||||
|
||||
/**
 * Prepares a fallback reason for display on a single line.
 *
 * @param reason Fallback reason text.
 * @returns The reason with whitespace runs collapsed to single spaces and
 *   outer whitespace removed, cut to 140 characters; `'unknown'` when nothing
 *   remains.
 */
function normalizeFallbackReasonForDisplay(reason: string): string {
  const words = reason.split(/\s+/).filter((word) => word.length > 0);
  if (words.length === 0) {
    return 'unknown';
  }
  return words.join(' ').slice(0, 140);
}
|
||||
|
||||
/**
 * Aggregates completion and end-to-end latency figures for one backend.
 *
 * @param backend Backend the turns were routed to (copied into the result).
 * @param turns Turns routed to that backend.
 * @returns Route and completion counts plus latency statistics; a turn counts
 *   as completed when it has an assistant timestamp, and only non-negative
 *   latencies are sampled.
 */
function buildBackendStats(backend: RoutedBackendName, turns: RouteTurn[]): BackendStats {
  let completedCount = 0;
  const latencySamples: number[] = [];

  for (const turn of turns) {
    const finishedAt = turn.assistantTimestamp;
    if (typeof finishedAt !== 'number') {
      continue;
    }
    completedCount += 1;
    const elapsed = finishedAt - turn.routeTimestamp;
    if (elapsed >= 0) {
      latencySamples.push(elapsed);
    }
  }

  return {
    backend,
    routes: turns.length,
    completed_turns: completedCount,
    incomplete_turns: turns.length - completedCount,
    completion_rate_pct: toPct(completedCount, turns.length),
    e2e_latency_ms: computeLatencyStats(latencySamples),
  };
}
|
||||
|
||||
/**
 * Aggregates external attempt outcomes over a set of turns.
 *
 * @param turns Turns to inspect.
 * @returns Counts of attempts (turns with a recorded outcome), successes,
 *   fallbacks and unresolved turns, the success rate, and statistics over the
 *   non-negative recorded durations.
 */
function buildAttemptStats(turns: RouteTurn[]): ExternalBackendAttemptStats {
  let attemptCount = 0;
  let successCount = 0;
  let fallbackCount = 0;
  const durations: number[] = [];

  for (const turn of turns) {
    if (turn.externalOutcome === undefined) {
      continue;
    }
    attemptCount += 1;
    if (turn.externalOutcome === 'success') {
      successCount += 1;
    } else if (turn.externalOutcome === 'fallback') {
      fallbackCount += 1;
    }

    const duration = turn.externalDurationMs;
    if (typeof duration === 'number' && duration >= 0) {
      durations.push(duration);
    }
  }

  return {
    attempts: attemptCount,
    successes: successCount,
    fallbacks: fallbackCount,
    unresolved_attempts: turns.length - attemptCount,
    success_rate_pct: toPct(successCount, attemptCount),
    attempt_latency_ms: computeLatencyStats(durations),
  };
}
|
||||
|
||||
/**
 * Turns per-category fallback counts into a ranked list.
 *
 * @param categoryCounts Number of fallbacks per category.
 * @returns One entry per category with its share of the total (two decimals),
 *   ordered by descending count and then by category name.
 */
function sortFallbackReasonStats(categoryCounts: Map<string, number>): FallbackReasonStats[] {
  let total = 0;
  for (const count of categoryCounts.values()) {
    total += count;
  }

  const rows = Array.from(categoryCounts, ([category, count]) => ({
    category,
    count,
    pct: total > 0 ? Math.round((count / total) * 10000) / 100 : 0,
  }));
  rows.sort((left, right) => right.count - left.count || left.category.localeCompare(right.category));
  return rows;
}
|
||||
|
||||
/**
 * Picks the most frequent fallback reasons.
 *
 * @param rawReasonCounts Number of fallbacks per reason text.
 * @param limit Maximum number of entries to return (default 10).
 * @returns Up to `limit` entries ordered by descending count and then by reason text.
 */
function sortTopRawReasons(rawReasonCounts: Map<string, number>, limit = 10): FallbackRawReasonStats[] {
  const ranked = Array.from(rawReasonCounts, ([reason, count]) => ({ reason, count }));
  ranked.sort((left, right) => right.count - left.count || left.reason.localeCompare(right.reason));
  return ranked.slice(0, limit);
}
|
||||
|
||||
/**
 * Builds a canary summary from audit events.
 *
 * Events are processed in timestamp order. Each valid `backend.route` event
 * that passes the filters opens a turn for its session. Later
 * `backend.success` / `backend.fallback` events record the external outcome
 * on the oldest pending turn of that session routed to the reported backend,
 * and an assistant `session.message` completes the oldest pending turn of the
 * session.
 *
 * @param events Audit events in any order; the array is not modified.
 * @param options Target and baseline backends plus optional route filters
 *   (an empty or missing filter list means no restriction).
 * @returns Route counts, target and baseline statistics, external attempt
 *   statistics for the target (`null` when the target is `native`),
 *   target-minus-baseline deltas, and fallback reason rankings.
 */
export function summarizeBackendCanary(events: AuditEvent[], options: BackendCanarySummaryOptions): BackendCanarySummary {
  const allowedSessions = new Set(options.sessionIds ?? []);
  const allowedChannels = new Set(options.channels ?? []);
  const allowedSenders = new Set(options.senders ?? []);
  const allowedSources = new Set(options.routeSources ?? []);

  // An empty filter set places no restriction on the candidate.
  const passes = <T>(allowed: Set<T>, candidate: T): boolean => allowed.size === 0 || allowed.has(candidate);

  const increment = (counts: Map<string, number>, key: string): void => {
    counts.set(key, (counts.get(key) ?? 0) + 1);
  };

  // Oldest pending turn routed to the given external backend that has no outcome yet.
  const findOpenExternalTurn = (pending: RouteTurn[], backend: unknown): RouteTurn | undefined => {
    if (!isRoutedBackendName(backend) || backend === 'native') {
      return undefined;
    }
    return pending.find((candidate) => (
      candidate.selectedBackend === backend && candidate.externalOutcome === undefined
    ));
  };

  const routeStats: RouteStats = {
    total: 0,
    by_backend: {},
    by_source: {},
    forced_native_guards: {},
  };
  const pendingBySession = new Map<string, RouteTurn[]>();
  const allTurns: RouteTurn[] = [];
  const categoryCounts = new Map<string, number>();
  const rawReasonCounts = new Map<string, number>();

  const chronological = [...events].sort((left, right) => left.timestamp - right.timestamp);

  for (const event of chronological) {
    const payload = toRecord(event.event);
    const sessionId = readString(payload.session_id);

    if (event.event_type === 'backend.route') {
      const channel = readString(payload.channel);
      const sender = readString(payload.sender);
      const selectedBackend = payload.selected_backend;
      const source = payload.source;

      // Skip malformed route events.
      if (!sessionId || !channel || !sender) {
        continue;
      }
      if (!isRoutedBackendName(selectedBackend) || !isRouteSource(source)) {
        continue;
      }
      // Skip route events excluded by any active filter.
      if (
        !passes(allowedSessions, sessionId)
        || !passes(allowedChannels, channel)
        || !passes(allowedSenders, sender)
        || !passes(allowedSources, source)
      ) {
        continue;
      }

      const guardReason = payload.guard_reason;
      const turn: RouteTurn = {
        sessionId,
        channel,
        sender,
        selectedBackend,
        source,
        guardReason: isGuardReason(guardReason) ? guardReason : undefined,
        routeTimestamp: event.timestamp,
      };

      routeStats.total += 1;
      routeStats.by_backend[selectedBackend] = (routeStats.by_backend[selectedBackend] ?? 0) + 1;
      routeStats.by_source[source] = (routeStats.by_source[source] ?? 0) + 1;
      if (turn.guardReason) {
        routeStats.forced_native_guards[turn.guardReason] = (routeStats.forced_native_guards[turn.guardReason] ?? 0) + 1;
      }

      const sessionQueue = pendingBySession.get(sessionId) ?? [];
      sessionQueue.push(turn);
      pendingBySession.set(sessionId, sessionQueue);
      allTurns.push(turn);
      continue;
    }

    // Every other event type needs a pending turn in its session.
    if (!sessionId) {
      continue;
    }
    const pending = pendingBySession.get(sessionId);
    if (!pending || pending.length === 0) {
      continue;
    }

    switch (event.event_type) {
      case 'backend.success': {
        const turn = findOpenExternalTurn(pending, payload.backend);
        if (turn) {
          turn.externalOutcome = 'success';
          turn.externalDurationMs = readNumber(payload.duration_ms);
        }
        break;
      }
      case 'backend.fallback': {
        const turn = findOpenExternalTurn(pending, payload.from_backend);
        if (!turn) {
          break;
        }
        const reason = readString(payload.reason) ?? 'unknown';
        turn.externalOutcome = 'fallback';
        turn.externalDurationMs = readNumber(payload.duration_ms);
        turn.fallbackReason = reason;
        increment(categoryCounts, normalizeFallbackCategory(reason));
        increment(rawReasonCounts, normalizeFallbackReasonForDisplay(reason));
        break;
      }
      case 'session.message': {
        if (readString(payload.role) !== 'assistant') {
          break;
        }
        // The assistant reply completes the oldest pending turn and removes it from the queue.
        const turn = pending.shift();
        if (turn) {
          turn.assistantTimestamp = event.timestamp;
        }
        break;
      }
      default:
        break;
    }
  }

  const targetTurns = allTurns.filter((turn) => turn.selectedBackend === options.targetBackend);
  const baselineTurns = allTurns.filter((turn) => turn.selectedBackend === options.baselineBackend);
  const target = buildBackendStats(options.targetBackend, targetTurns);
  const baseline = buildBackendStats(options.baselineBackend, baselineTurns);

  const targetRate = target.completion_rate_pct;
  const baselineRate = baseline.completion_rate_pct;
  const targetLatency = target.e2e_latency_ms;
  const baselineLatency = baseline.e2e_latency_ms;

  const comparison: BackendComparisonStats = {
    completion_rate_delta_pp: targetRate !== null && baselineRate !== null
      ? Math.round((targetRate - baselineRate) * 100) / 100
      : null,
    p50_latency_delta_ms: targetLatency && baselineLatency
      ? targetLatency.p50_ms - baselineLatency.p50_ms
      : null,
    p95_latency_delta_ms: targetLatency && baselineLatency
      ? targetLatency.p95_ms - baselineLatency.p95_ms
      : null,
  };

  return {
    route_stats: routeStats,
    target,
    baseline,
    target_external_attempts: options.targetBackend === 'native' ? null : buildAttemptStats(targetTurns),
    comparison,
    fallback_categories: sortFallbackReasonStats(categoryCounts),
    fallback_top_reasons: sortTopRawReasons(rawReasonCounts),
  };
}
|
||||
|
||||
/** Formats a percentage with two decimals and a `%` suffix; `null` becomes `'n/a'`. */
function formatPct(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return `${value.toFixed(2)}%`;
}
|
||||
|
||||
/**
 * Formats a millisecond quantity for human-readable report output.
 *
 * @param value - Duration or delta in milliseconds, or `null` when unavailable.
 * @returns The value followed by `ms` (no rounding is applied), or `'n/a'` when
 *   the value is `null` or not a finite number.
 */
function formatMs(value: number | null): string {
  // A non-finite value would otherwise be printed as "NaNms" / "Infinityms".
  if (value === null || !Number.isFinite(value)) {
    return 'n/a';
  }
  return `${value}ms`;
}
|
||||
|
||||
/**
 * Evaluates a canary summary against the configured gate thresholds.
 *
 * Only thresholds whose value is a number produce a criterion; the others are
 * skipped. A criterion whose underlying metric is unavailable (`null`) is
 * recorded as failed, so missing data holds the gate instead of passing it.
 *
 * @param summary - Canary summary providing the comparison deltas and the
 *   target's external attempt counters.
 * @param thresholds - Limits to enforce.
 * @returns The per-criterion results in a fixed order (completion rate, P50
 *   latency, P95 latency, fallback rate) plus the overall verdict. With no
 *   numeric threshold the criteria list is empty and the gate passes.
 */
export function evaluateBackendCanaryGate(
  summary: BackendCanarySummary,
  thresholds: BackendCanaryGateThresholds,
): BackendCanaryGateResult {
  const criteria: BackendCanaryGateResult['criteria'] = [];
  const { comparison, target_external_attempts: externalAttempts } = summary;

  // The P50 and P95 checks differ only in label, delta and limit.
  const addLatencyCriterion = (
    label: string,
    delta: number | null,
    limit: number | null | undefined,
  ): void => {
    if (typeof limit !== 'number') {
      return;
    }
    criteria.push({
      criterion: `${label} latency delta (target - baseline)`,
      pass: delta !== null && delta <= limit,
      actual: formatMs(delta),
      threshold: `<= ${limit}ms`,
    });
  };

  const maxDropPp = thresholds.maxCompletionRateDropPp;
  if (typeof maxDropPp === 'number') {
    const delta = comparison.completion_rate_delta_pp;
    criteria.push({
      criterion: 'Completion rate delta (target - baseline)',
      pass: delta !== null && delta >= -maxDropPp,
      actual: delta === null ? 'n/a' : `${delta.toFixed(2)}pp`,
      // Format the negated limit itself rather than prefixing "-" to the text,
      // so a zero or negative limit does not read "-0.00pp" or "--5.00pp".
      threshold: `>= ${(-maxDropPp).toFixed(2)}pp`,
    });
  }

  addLatencyCriterion('P50', comparison.p50_latency_delta_ms, thresholds.maxP50LatencyIncreaseMs);
  addLatencyCriterion('P95', comparison.p95_latency_delta_ms, thresholds.maxP95LatencyIncreaseMs);

  const maxFallbackPct = thresholds.maxFallbackRatePct;
  if (typeof maxFallbackPct === 'number') {
    // No external attempt stats means the rate is unknown, which fails the check.
    const fallbackRate = externalAttempts
      ? toPct(externalAttempts.fallbacks, externalAttempts.attempts)
      : null;
    criteria.push({
      criterion: 'Fallback rate (target external attempts)',
      pass: fallbackRate !== null && fallbackRate <= maxFallbackPct,
      actual: formatPct(fallbackRate),
      threshold: `<= ${maxFallbackPct.toFixed(2)}%`,
    });
  }

  return {
    // `every` is true for an empty list, so an unconfigured gate passes.
    pass: criteria.every((criterion) => criterion.pass),
    criteria,
  };
}
|
||||
|
||||
/**
 * Renders one latency distribution as a single Markdown bullet.
 *
 * @param label - Text placed before the colon.
 * @param stats - Latency statistics, or `null` when none are available.
 * @returns `- <label>: n/a` when `stats` is missing; otherwise the bullet with
 *   count, avg, p50, p95, min and max (in that order), each duration suffixed
 *   with `ms`.
 */
function renderLatencyInline(label: string, stats: LatencyStats | null): string {
  if (!stats) {
    return `- ${label}: n/a`;
  }
  const fields = [
    `count=${stats.count}`,
    `avg=${stats.avg_ms}ms`,
    `p50=${stats.p50_ms}ms`,
    `p95=${stats.p95_ms}ms`,
    `min=${stats.min_ms}ms`,
    `max=${stats.max_ms}ms`,
  ];
  return `- ${label}: ${fields.join(', ')}`;
}
|
||||
|
||||
/**
 * Renders a canary summary as a Markdown report.
 *
 * Sections are emitted in a fixed order: header, route distribution,
 * reliability, latency, fallback taxonomy, top fallback reasons and, when a
 * gate result is supplied, the gate evaluation. The top-level heading is fixed
 * text and does not vary with `options.targetBackend`.
 *
 * @param summary - Canary summary to render.
 * @param options - Supplies the target and baseline backend names for the header.
 * @param gate - Optional gate result; its section is omitted when absent.
 * @returns The report, with lines joined by `\n` and ending with a newline.
 */
export function renderBackendCanaryMarkdown(
  summary: BackendCanarySummary,
  options: BackendCanarySummaryOptions,
  gate?: BackendCanaryGateResult,
): string {
  // Free-form values (notably raw fallback reasons, presumably copied from audit
  // events) may contain line breaks or pipes, which would split a bullet or a
  // table row. Collapse line breaks everywhere and escape pipes in table cells.
  const toInline = (text: string): string => String(text).replace(/\s*[\r\n]+\s*/g, ' ');
  const toCell = (text: string): string => toInline(text).replace(/\|/g, '\\|');

  const {
    route_stats: routeStats,
    target,
    baseline,
    comparison,
    target_external_attempts: externalAttempts,
  } = summary;

  const lines: string[] = [
    '# Pi Embedded Canary Summary',
    '',
    `- Target backend: \`${options.targetBackend}\``,
    `- Baseline backend: \`${options.baselineBackend}\``,
    `- Routes analyzed: ${routeStats.total}`,
    '',
  ];

  // Route distribution: busiest backend first, ties broken by name.
  lines.push('## Route Distribution', '', '| Backend | Routes |', '| --- | ---: |');
  const backendRows = Object.entries(routeStats.by_backend)
    .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
  if (backendRows.length === 0) {
    lines.push('| _none_ | 0 |');
  } else {
    for (const [backend, count] of backendRows) {
      lines.push(`| ${toCell(backend)} | ${count} |`);
    }
  }
  lines.push('');

  // Reliability: the external rows only exist when external attempt stats do.
  const completionDeltaPp = comparison.completion_rate_delta_pp;
  const completionDeltaText = completionDeltaPp === null ? 'n/a' : `${completionDeltaPp.toFixed(2)}pp`;
  lines.push(
    '## Reliability',
    '',
    '| Metric | Target | Baseline | Delta |',
    '| --- | ---: | ---: | ---: |',
    `| Turn completion rate | ${formatPct(target.completion_rate_pct)} | ${formatPct(baseline.completion_rate_pct)} | ${completionDeltaText} |`,
  );
  if (externalAttempts) {
    lines.push(
      `| External success rate | ${formatPct(externalAttempts.success_rate_pct)} | n/a | n/a |`,
      `| External attempts | ${externalAttempts.attempts} | n/a | n/a |`,
      `| External fallbacks | ${externalAttempts.fallbacks} | n/a | n/a |`,
    );
  }
  lines.push('');

  lines.push(
    '## Latency',
    '',
    renderLatencyInline('Target end-to-end', target.e2e_latency_ms),
    renderLatencyInline('Baseline end-to-end', baseline.e2e_latency_ms),
    `- P50 delta (target - baseline): ${formatMs(comparison.p50_latency_delta_ms)}`,
    `- P95 delta (target - baseline): ${formatMs(comparison.p95_latency_delta_ms)}`,
  );
  if (externalAttempts) {
    lines.push(renderLatencyInline('Target external attempt', externalAttempts.attempt_latency_ms));
  }
  lines.push('');

  lines.push('## Fallback Taxonomy', '', '| Category | Count | Percent |', '| --- | ---: | ---: |');
  if (summary.fallback_categories.length === 0) {
    lines.push('| _none_ | 0 | 0.00% |');
  } else {
    for (const item of summary.fallback_categories) {
      lines.push(`| ${toCell(item.category)} | ${item.count} | ${item.pct.toFixed(2)}% |`);
    }
  }
  lines.push('');

  lines.push('## Top Fallback Reasons', '');
  if (summary.fallback_top_reasons.length === 0) {
    lines.push('- none');
  } else {
    for (const item of summary.fallback_top_reasons) {
      lines.push(`- ${toInline(item.reason)} (${item.count})`);
    }
  }
  lines.push('');

  if (gate) {
    lines.push('## Gate Evaluation', '', `- Gate result: ${gate.pass ? 'PASS' : 'HOLD'}`);
    for (const criterion of gate.criteria) {
      // Rendered as a task-list item: checked when the criterion passed.
      lines.push(`- [${criterion.pass ? 'x' : ' '}] ${criterion.criterion}: actual=${criterion.actual}, threshold=${criterion.threshold}`);
    }
    lines.push('');
  }

  return lines.join('\n');
}
|
||||
Reference in New Issue
Block a user