feat(audit): add pi canary summary analyzer and cli script

This commit is contained in:
William Valentin
2026-02-23 22:26:29 -08:00
parent 1dfa6ce2b4
commit afddd1ba7a
4 changed files with 1041 additions and 1 deletions
+2 -1
View File
@@ -19,7 +19,8 @@
"lint": "eslint src/",
"typecheck": "tsc --noEmit",
"config:profiles:generate": "node scripts/generate-config-profiles.mjs",
"config:profiles:check": "node scripts/generate-config-profiles.mjs --check"
"config:profiles:check": "node scripts/generate-config-profiles.mjs --check",
"audit:backend-canary": "node --import tsx/esm scripts/summarize-backend-canary.ts"
},
"keywords": [
"ai",
+208
View File
@@ -0,0 +1,208 @@
#!/usr/bin/env node
import { writeFile } from 'node:fs/promises';
import { parseArgs } from 'node:util';
import { queryAuditLogs } from '../src/audit/export.js';
import {
evaluateBackendCanaryGate,
renderBackendCanaryMarkdown,
summarizeBackendCanary,
type BackendCanaryGateThresholds,
type BackendCanarySummaryOptions,
type BackendRouteSource,
type RoutedBackendName,
} from '../src/audit/backendCanarySummary.js';
// Audit event types this script asks queryAuditLogs for; main() passes a copy
// of this list as the `event_types` filter.
const DEFAULT_EVENT_TYPES = ['backend.route', 'backend.success', 'backend.fallback', 'session.message'] as const;
/**
 * Builds the help text printed for `--help` and appended to error output.
 *
 * @returns The usage banner, the general options, and the gate options,
 *   joined with newlines and without a trailing newline.
 */
function usage(): string {
  const banner = 'Usage: node --import tsx/esm scripts/summarize-backend-canary.ts --audit <path> [options]';
  const generalOptions = [
    ' --audit <path> Path to audit.log (required)',
    ' --backend <name> Target backend (default: pi_embedded)',
    ' --baseline <name> Baseline backend (default: native)',
    ' --since <ISO-8601|epoch_ms> Start time filter',
    ' --until <ISO-8601|epoch_ms> End time filter',
    ' --session <id[,id...]> Restrict to session IDs',
    ' --channel <name[,name...]> Restrict to channels',
    ' --sender <id[,id...]> Restrict to senders',
    ' --source <name[,name...]> Restrict route sources (agent_override,default_external,native,forced_native_guard)',
    ' --format <markdown|json> Output format (default: markdown)',
    ' --out <path> Write output to file instead of stdout',
  ];
  const gateOptions = [
    ' --gate-max-completion-drop-pp <number>',
    ' --gate-max-p50-latency-increase-ms <number>',
    ' --gate-max-p95-latency-increase-ms <number>',
    ' --gate-max-fallback-rate-pct <number>',
  ];
  return `${banner}\n\nOptions:\n${generalOptions.join('\n')}\n\nGate options (optional):\n${gateOptions.join('\n')}`;
}
/**
 * Converts a time flag value into epoch milliseconds.
 *
 * A value made only of digits is taken as epoch milliseconds; anything else
 * is handed to `Date.parse`.
 *
 * @param value Raw flag value; `undefined` or an empty string means "not set".
 * @param flag Flag name used in the error message.
 * @returns Epoch milliseconds, or `undefined` when the flag was not set.
 * @throws Error when the value is neither a finite digit string nor parseable by `Date.parse`.
 */
function parseTime(value: string | undefined, flag: string): number | undefined {
  if (value === undefined || value === '') {
    return undefined;
  }
  const looksLikeEpoch = /^\d+$/.test(value);
  const epochMs = looksLikeEpoch ? Number(value) : Number.NaN;
  if (Number.isFinite(epochMs)) {
    return epochMs;
  }
  // Non-numeric input, or a digit string too large to be finite, goes through Date.parse.
  const fromDate = Date.parse(value);
  if (Number.isFinite(fromDate)) {
    return fromDate;
  }
  throw new Error(`Invalid ${flag} value "${value}". Use ISO-8601 or epoch milliseconds.`);
}
/**
 * Splits a comma-separated flag value into trimmed, non-empty entries.
 *
 * @param value Raw flag value, possibly `undefined` or empty.
 * @returns The entries in input order, or `undefined` when nothing usable remains.
 */
function parseCsv(value: string | undefined): string[] | undefined {
  if (!value) {
    return undefined;
  }
  const entries: string[] = [];
  for (const piece of value.split(',')) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }
  return entries.length === 0 ? undefined : entries;
}
/**
 * Parses an optional numeric flag value.
 *
 * @param raw Raw flag value; `undefined` or an empty string means "not set".
 * @param flag Flag name used in the error message.
 * @returns The parsed finite number, or `undefined` when the flag was not set.
 * @throws Error when the value is blank or does not convert to a finite number.
 */
function parseOptionalNumber(raw: string | undefined, flag: string): number | undefined {
  if (!raw) {
    return undefined;
  }
  // Number('   ') is 0, so a whitespace-only value would otherwise be accepted
  // as a zero threshold. Treat it as invalid input instead.
  const trimmed = raw.trim();
  const parsed = trimmed.length > 0 ? Number(trimmed) : Number.NaN;
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid ${flag} value "${raw}". Expected a number.`);
  }
  return parsed;
}
/**
 * Validates a backend name taken from the command line.
 *
 * @param raw Raw flag value; when `undefined` the fallback is used.
 * @param fallback Backend name used when the flag is absent.
 * @returns The trimmed backend name if it is one of the recognised backends.
 * @throws Error when the trimmed value is not a recognised backend name.
 */
function parseBackendName(raw: string | undefined, fallback: RoutedBackendName): RoutedBackendName {
  const candidate: string = (raw ?? fallback).trim();
  switch (candidate) {
    case 'native':
    case 'claude_code':
    case 'opencode':
    case 'codex':
    case 'gemini':
    case 'pi_embedded':
      return candidate;
    default:
      throw new Error(`Invalid backend "${candidate}".`);
  }
}
/**
 * Parses the comma-separated `--source` flag into validated route sources.
 *
 * @param raw Raw flag value, possibly `undefined`.
 * @returns The route sources in input order, or `undefined` when the flag holds no entries.
 * @throws Error on the first entry that is not a recognised route source.
 */
function parseSources(raw: string | undefined): BackendRouteSource[] | undefined {
  const names = parseCsv(raw);
  if (!names) {
    return undefined;
  }
  return names.map((name): BackendRouteSource => {
    switch (name) {
      case 'agent_override':
      case 'default_external':
      case 'native':
      case 'forced_native_guard':
        return name;
      default:
        throw new Error(`Invalid source "${name}".`);
    }
  });
}
/**
 * CLI entry point: parses flags, loads the audit events, and prints or writes
 * the canary summary as Markdown or JSON.
 *
 * Every flag value (format, backends, filters, time range, gate thresholds) is
 * validated before the audit log is read, so a malformed flag is reported
 * immediately rather than after the log has been queried and summarized.
 *
 * NOTE(review): a failing gate is only reflected in the output ("HOLD" or
 * `gate.pass: false`); this function does not change the exit code for it.
 * Confirm that is the intended contract for CI use.
 *
 * @throws Error when `--audit` is missing or a flag value is invalid. Errors
 *   from parseArgs, queryAuditLogs, and writeFile propagate unchanged.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    options: {
      audit: { type: 'string' },
      backend: { type: 'string' },
      baseline: { type: 'string' },
      since: { type: 'string' },
      until: { type: 'string' },
      session: { type: 'string' },
      channel: { type: 'string' },
      sender: { type: 'string' },
      source: { type: 'string' },
      format: { type: 'string' },
      out: { type: 'string' },
      'gate-max-completion-drop-pp': { type: 'string' },
      'gate-max-p50-latency-increase-ms': { type: 'string' },
      'gate-max-p95-latency-increase-ms': { type: 'string' },
      'gate-max-fallback-rate-pct': { type: 'string' },
      help: { type: 'boolean', short: 'h' },
    },
    strict: true,
    allowPositionals: false,
  });
  if (values.help) {
    process.stdout.write(`${usage()}\n`);
    return;
  }
  if (!values.audit) {
    throw new Error('--audit is required.');
  }
  const format = values.format ?? 'markdown';
  if (format !== 'markdown' && format !== 'json') {
    throw new Error(`Invalid --format value "${format}".`);
  }

  // Validate all remaining flags up front, before any file I/O happens.
  const summaryOptions: BackendCanarySummaryOptions = {
    targetBackend: parseBackendName(values.backend, 'pi_embedded'),
    baselineBackend: parseBackendName(values.baseline, 'native'),
    sessionIds: parseCsv(values.session),
    channels: parseCsv(values.channel),
    senders: parseCsv(values.sender),
    routeSources: parseSources(values.source),
  };
  const startTime = parseTime(values.since, '--since');
  const endTime = parseTime(values.until, '--until');
  const gateThresholds: BackendCanaryGateThresholds = {
    maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
    maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
    maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
    maxFallbackRatePct: parseOptionalNumber(values['gate-max-fallback-rate-pct'], '--gate-max-fallback-rate-pct'),
  };
  // The gate is evaluated only when at least one threshold flag was supplied.
  const hasGateThreshold = Object.values(gateThresholds).some((value) => typeof value === 'number');

  const events = await queryAuditLogs(values.audit, {
    start_time: startTime,
    end_time: endTime,
    event_types: [...DEFAULT_EVENT_TYPES],
  });
  const summary = summarizeBackendCanary(events, summaryOptions);
  const gateResult = hasGateThreshold ? evaluateBackendCanaryGate(summary, gateThresholds) : undefined;

  const output = format === 'json'
    ? JSON.stringify({
      generated_at: new Date().toISOString(),
      event_count: events.length,
      filters: {
        since_ms: startTime,
        until_ms: endTime,
      },
      options: summaryOptions,
      summary,
      gate: gateResult,
    }, null, 2)
    : renderBackendCanaryMarkdown(summary, summaryOptions, gateResult);

  if (values.out) {
    await writeFile(values.out, `${output}\n`, 'utf-8');
  } else {
    process.stdout.write(`${output}\n`);
  }
}
/**
 * Reports a fatal CLI error: writes the message followed by the usage text to
 * stderr and marks the process as failed without exiting abruptly.
 */
function reportFatal(failure: unknown): void {
  const detail = failure instanceof Error ? failure.message : String(failure);
  process.stderr.write(`${detail}\n\n${usage()}\n`);
  process.exitCode = 1;
}

main().catch(reportFatal);
+213
View File
@@ -0,0 +1,213 @@
import { describe, expect, it } from 'vitest';
import type { AuditEvent } from './types.js';
import {
evaluateBackendCanaryGate,
renderBackendCanaryMarkdown,
summarizeBackendCanary,
} from './backendCanarySummary.js';
/**
 * Test fixture factory: wraps a payload in an `info`-level audit event.
 *
 * @param timestamp Event timestamp copied onto the event.
 * @param event_type Audit event type copied onto the event.
 * @param event Payload stored under `event`.
 * @returns The assembled audit event.
 */
function makeEvent(
  timestamp: number,
  event_type: AuditEvent['event_type'],
  event: Record<string, unknown>,
): AuditEvent {
  const fixture: AuditEvent = { timestamp, level: 'info', event_type, event };
  return fixture;
}
// Specs for summarizeBackendCanary: route counting, per-backend completion and
// latency, external attempt outcomes, fallback taxonomy, and route filtering.
describe('summarizeBackendCanary', () => {
it('computes route, reliability, latency, and fallback summaries', () => {
// Fixture: two pi_embedded turns in session `telegram:canary` (one success,
// one fallback to native) and one native turn in session `telegram:control`.
const events: AuditEvent[] = [
// Canary turn 1: routed at 1000, assistant reply at 1140 => 140ms end-to-end.
makeEvent(1000, 'backend.route', {
session_id: 'telegram:canary',
channel: 'telegram',
sender: '8367012007',
selected_backend: 'pi_embedded',
source: 'agent_override',
}),
makeEvent(1120, 'backend.success', {
session_id: 'telegram:canary',
channel: 'telegram',
sender: '8367012007',
backend: 'pi_embedded',
duration_ms: 120,
response_length: 50,
}),
makeEvent(1140, 'session.message', {
session_id: 'telegram:canary',
role: 'assistant',
content_length: 50,
}),
// Canary turn 2: routed at 2000, falls back, assistant reply at 2340 => 340ms end-to-end.
makeEvent(2000, 'backend.route', {
session_id: 'telegram:canary',
channel: 'telegram',
sender: '8367012007',
selected_backend: 'pi_embedded',
source: 'agent_override',
}),
makeEvent(2300, 'backend.fallback', {
session_id: 'telegram:canary',
channel: 'telegram',
sender: '8367012007',
from_backend: 'pi_embedded',
to_backend: 'native',
reason: 'request timed out waiting for backend process',
duration_ms: 300,
}),
makeEvent(2340, 'session.message', {
session_id: 'telegram:canary',
role: 'assistant',
content_length: 80,
}),
// Control turn: native route at 3000, assistant reply at 3080 => 80ms end-to-end.
makeEvent(3000, 'backend.route', {
session_id: 'telegram:control',
channel: 'telegram',
sender: '123',
selected_backend: 'native',
source: 'native',
}),
makeEvent(3080, 'session.message', {
session_id: 'telegram:control',
role: 'assistant',
content_length: 25,
}),
];
const summary = summarizeBackendCanary(events, {
targetBackend: 'pi_embedded',
baselineBackend: 'native',
});
expect(summary.route_stats.total).toBe(3);
expect(summary.route_stats.by_backend.pi_embedded).toBe(2);
expect(summary.route_stats.by_backend.native).toBe(1);
expect(summary.target.routes).toBe(2);
expect(summary.target.completed_turns).toBe(2);
expect(summary.target.completion_rate_pct).toBe(100);
// Target samples are [140, 340]; linear interpolation gives p50=240 and p95=330.
expect(summary.target.e2e_latency_ms?.p50_ms).toBe(240);
expect(summary.target.e2e_latency_ms?.p95_ms).toBe(330);
expect(summary.baseline.routes).toBe(1);
expect(summary.baseline.completion_rate_pct).toBe(100);
// A single baseline sample (80ms) is its own percentile at every level.
expect(summary.baseline.e2e_latency_ms?.p50_ms).toBe(80);
expect(summary.target_external_attempts?.attempts).toBe(2);
expect(summary.target_external_attempts?.successes).toBe(1);
expect(summary.target_external_attempts?.fallbacks).toBe(1);
expect(summary.target_external_attempts?.success_rate_pct).toBe(50);
// Attempt durations are [120, 300], so the interpolated p50 is 210.
expect(summary.target_external_attempts?.attempt_latency_ms?.p50_ms).toBe(210);
// Deltas are target minus baseline: 240-80 and 330-80.
expect(summary.comparison.p50_latency_delta_ms).toBe(160);
expect(summary.comparison.p95_latency_delta_ms).toBe(250);
// The fallback reason contains "timed out", which maps to the `timeout` category.
expect(summary.fallback_categories).toEqual([
{ category: 'timeout', count: 1, pct: 100 },
]);
expect(summary.fallback_top_reasons[0]?.reason).toContain('request timed out');
});
it('filters routes by session id', () => {
// Two sessions are present; only `telegram:canary` is allowed by the filter.
const events: AuditEvent[] = [
makeEvent(1000, 'backend.route', {
session_id: 'telegram:canary',
channel: 'telegram',
sender: '8367012007',
selected_backend: 'pi_embedded',
source: 'agent_override',
}),
makeEvent(1100, 'session.message', {
session_id: 'telegram:canary',
role: 'assistant',
content_length: 10,
}),
makeEvent(2000, 'backend.route', {
session_id: 'telegram:other',
channel: 'telegram',
sender: '9999',
selected_backend: 'native',
source: 'native',
}),
makeEvent(2100, 'session.message', {
session_id: 'telegram:other',
role: 'assistant',
content_length: 10,
}),
];
const summary = summarizeBackendCanary(events, {
targetBackend: 'pi_embedded',
baselineBackend: 'native',
sessionIds: ['telegram:canary'],
});
// The native route belongs to the excluded session, so the baseline sees no routes.
expect(summary.route_stats.total).toBe(1);
expect(summary.target.routes).toBe(1);
expect(summary.baseline.routes).toBe(0);
});
});
// Specs for evaluateBackendCanaryGate and the gate section of the Markdown report.
describe('evaluateBackendCanaryGate', () => {
it('evaluates configured pass/fail thresholds', () => {
// Fixture: one successful pi_embedded turn (250ms end-to-end) and one native
// turn (50ms end-to-end), both completed and with no fallbacks.
const events: AuditEvent[] = [
makeEvent(1000, 'backend.route', {
session_id: 's1',
channel: 'telegram',
sender: '1',
selected_backend: 'pi_embedded',
source: 'agent_override',
}),
makeEvent(1200, 'backend.success', {
session_id: 's1',
channel: 'telegram',
sender: '1',
backend: 'pi_embedded',
duration_ms: 200,
response_length: 10,
}),
makeEvent(1250, 'session.message', {
session_id: 's1',
role: 'assistant',
content_length: 20,
}),
makeEvent(2000, 'backend.route', {
session_id: 's2',
channel: 'telegram',
sender: '2',
selected_backend: 'native',
source: 'native',
}),
makeEvent(2050, 'session.message', {
session_id: 's2',
role: 'assistant',
content_length: 20,
}),
];
const summary = summarizeBackendCanary(events, {
targetBackend: 'pi_embedded',
baselineBackend: 'native',
});
// Completion delta is 0pp, both latency deltas are 200ms (250-50), and the
// fallback rate is 0%, so all four thresholds below are satisfied.
const gate = evaluateBackendCanaryGate(summary, {
maxCompletionRateDropPp: 0,
maxP50LatencyIncreaseMs: 300,
maxP95LatencyIncreaseMs: 300,
maxFallbackRatePct: 5,
});
expect(gate.pass).toBe(true);
expect(gate.criteria).toHaveLength(4);
const markdown = renderBackendCanaryMarkdown(summary, {
targetBackend: 'pi_embedded',
baselineBackend: 'native',
}, gate);
expect(markdown).toContain('Pi Embedded Canary Summary');
expect(markdown).toContain('Gate result: PASS');
});
});
+618
View File
@@ -0,0 +1,618 @@
import type { ExternalBackendName } from '../backends/external.js';
import type { AuditEvent } from './types.js';
/** A backend a turn can be routed to: `native` or any `ExternalBackendName`. */
export type RoutedBackendName = 'native' | ExternalBackendName;
/** Values accepted for the `source` field of a `backend.route` audit event. */
export type BackendRouteSource = 'agent_override' | 'default_external' | 'native' | 'forced_native_guard';
/** Values accepted for the optional `guard_reason` field of a `backend.route` audit event. */
export type BackendGuardReason = 'capability_query' | 'pi_no_tools_mode' | 'attachments_present';
/**
 * Options for summarizeBackendCanary.
 *
 * The optional lists are route filters: a route event is kept only if it
 * matches every non-empty list. An omitted or empty list disables that filter.
 */
export interface BackendCanarySummaryOptions {
// Backend whose turns populate `target` and `target_external_attempts`.
targetBackend: RoutedBackendName;
// Backend whose turns populate `baseline` and serve as the comparison reference.
baselineBackend: RoutedBackendName;
// Keep only routes whose `session_id` is listed.
sessionIds?: string[];
// Keep only routes whose `channel` is listed.
channels?: string[];
// Keep only routes whose `sender` is listed.
senders?: string[];
// Keep only routes whose `source` is listed.
routeSources?: BackendRouteSource[];
}
/**
 * Latency distribution over a set of millisecond samples, as produced by
 * computeLatencyStats. The average and percentiles are rounded to integers;
 * min and max are the raw extreme samples.
 */
export interface LatencyStats {
// Number of samples.
count: number;
// Arithmetic mean, rounded.
avg_ms: number;
// 50th percentile (linear interpolation between neighbouring samples), rounded.
p50_ms: number;
// 95th percentile (linear interpolation between neighbouring samples), rounded.
p95_ms: number;
min_ms: number;
max_ms: number;
}
/** Counts of `backend.route` events that passed validation and the route filters. */
export interface RouteStats {
// Number of accepted route events.
total: number;
// Accepted routes per selected backend; backends never seen are absent.
by_backend: Partial<Record<RoutedBackendName, number>>;
// Accepted routes per route source; sources never seen are absent.
by_source: Partial<Record<BackendRouteSource, number>>;
// Accepted routes per recognised `guard_reason`; routes without one are not counted here.
forced_native_guards: Partial<Record<BackendGuardReason, number>>;
}
/** Turn-level reliability and latency for the routes that selected one backend. */
export interface BackendStats {
backend: RoutedBackendName;
// Number of routes that selected this backend.
routes: number;
// Routes that were later paired with an assistant `session.message`.
completed_turns: number;
// Routes never paired with an assistant message (`routes - completed_turns`).
incomplete_turns: number;
// Completed turns as a percentage of routes (two decimals); null when there are no routes.
completion_rate_pct: number | null;
// Route-to-assistant-message latency over completed turns; null when there are no samples.
e2e_latency_ms: LatencyStats | null;
}
/** Outcome of the external-backend attempts made for the target backend's turns. */
export interface ExternalBackendAttemptStats {
// Turns that recorded an outcome (success or fallback).
attempts: number;
// Attempts paired with a `backend.success` event.
successes: number;
// Attempts paired with a `backend.fallback` event.
fallbacks: number;
// Turns with neither a success nor a fallback event.
unresolved_attempts: number;
// Successes as a percentage of attempts (two decimals); null when there are no attempts.
success_rate_pct: number | null;
// Distribution of the non-negative `duration_ms` values reported by the outcome events; null when none.
attempt_latency_ms: LatencyStats | null;
}
/** One row of the fallback taxonomy: a normalized category and its share. */
export interface FallbackReasonStats {
// Category name produced by normalizeFallbackCategory.
category: string;
count: number;
// Share of all counted fallbacks, as a percentage rounded to two decimals.
pct: number;
}
/** One row of the most frequent fallback reasons. */
export interface FallbackRawReasonStats {
// Reason text as produced by normalizeFallbackReasonForDisplay (single line, truncated).
reason: string;
count: number;
}
/**
 * Target-minus-baseline deltas. Each field is null when either side lacks the
 * underlying figure (no routes, or no latency samples).
 */
export interface BackendComparisonStats {
// Difference in completion rate, in percentage points.
completion_rate_delta_pp: number | null;
// Difference in end-to-end p50 latency, in milliseconds.
p50_latency_delta_ms: number | null;
// Difference in end-to-end p95 latency, in milliseconds.
p95_latency_delta_ms: number | null;
}
/** Full result of summarizeBackendCanary. */
export interface BackendCanarySummary {
route_stats: RouteStats;
target: BackendStats;
baseline: BackendStats;
// Null when the target backend is `native`.
target_external_attempts: ExternalBackendAttemptStats | null;
comparison: BackendComparisonStats;
// Sorted by count descending, then category name.
fallback_categories: FallbackReasonStats[];
// Sorted by count descending, then reason text; limited to the top entries.
fallback_top_reasons: FallbackRawReasonStats[];
}
/**
 * Thresholds for evaluateBackendCanaryGate. A criterion is evaluated only when
 * its threshold is a number; omitted thresholds are skipped.
 */
export interface BackendCanaryGateThresholds {
// Largest tolerated drop in completion rate (baseline minus target), in percentage points.
maxCompletionRateDropPp?: number;
// Largest tolerated increase in end-to-end p50 latency (target minus baseline), in milliseconds.
maxP50LatencyIncreaseMs?: number;
// Largest tolerated increase in end-to-end p95 latency (target minus baseline), in milliseconds.
maxP95LatencyIncreaseMs?: number;
// Largest tolerated share of target external attempts that fell back, in percent.
maxFallbackRatePct?: number;
}
/** Outcome of evaluateBackendCanaryGate. */
export interface BackendCanaryGateResult {
// True when every evaluated criterion passed, or when no criterion was evaluated.
pass: boolean;
// One entry per evaluated criterion, with the actual value and threshold pre-formatted for display.
criteria: Array<{ criterion: string; pass: boolean; actual: string; threshold: string }>;
}
/**
 * Internal working record for one accepted `backend.route` event, filled in as
 * later events from the same session are paired with it.
 */
interface RouteTurn {
sessionId: string;
channel: string;
sender: string;
selectedBackend: RoutedBackendName;
source: BackendRouteSource;
// Set only when the route event carried a recognised `guard_reason`.
guardReason?: BackendGuardReason;
// Timestamp of the route event.
routeTimestamp: number;
// Set when a `backend.success` or `backend.fallback` event is paired with this turn.
externalOutcome?: 'success' | 'fallback';
// `duration_ms` from the paired outcome event, when it was a finite number.
externalDurationMs?: number;
// `reason` from the paired fallback event (`unknown` when it was missing).
fallbackReason?: string;
// Timestamp of the assistant `session.message` that completed this turn.
assistantTimestamp?: number;
}
/**
 * Views an unknown value as a string-keyed record.
 *
 * @returns The value itself when it is a non-null object (arrays included), otherwise a new empty object.
 */
function toRecord(value: unknown): Record<string, unknown> {
  if (typeof value === 'object' && value !== null) {
    return value as Record<string, unknown>;
  }
  return {};
}
/** Returns the value when it is a string, otherwise `undefined`. */
function readString(value: unknown): string | undefined {
  if (typeof value !== 'string') {
    return undefined;
  }
  return value;
}
/** Returns the value when it is a finite number, otherwise `undefined`. */
function readNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
/** Type guard: true when the value is one of the recognised route sources. */
function isRouteSource(value: unknown): value is BackendRouteSource {
  switch (value) {
    case 'agent_override':
    case 'default_external':
    case 'native':
    case 'forced_native_guard':
      return true;
    default:
      return false;
  }
}
/** Type guard: true when the value is one of the recognised guard reasons. */
function isGuardReason(value: unknown): value is BackendGuardReason {
  switch (value) {
    case 'capability_query':
    case 'pi_no_tools_mode':
    case 'attachments_present':
      return true;
    default:
      return false;
  }
}
/** Type guard: true when the value is one of the recognised backend names. */
function isRoutedBackendName(value: unknown): value is RoutedBackendName {
  switch (value) {
    case 'native':
    case 'claude_code':
    case 'opencode':
    case 'codex':
    case 'gemini':
    case 'pi_embedded':
      return true;
    default:
      return false;
  }
}
/**
 * Computes a percentile of an ascending-sorted sample list using linear
 * interpolation between the two nearest ranks.
 *
 * @param sortedAscending Samples, already sorted ascending.
 * @param pct Percentile in the range 0..100; values outside are clamped.
 * @returns The interpolated value, or 0 for an empty list.
 */
function percentile(sortedAscending: number[], pct: number): number {
  const size = sortedAscending.length;
  if (size === 0) {
    return 0;
  }
  if (size === 1) {
    return sortedAscending[0];
  }
  const boundedPct = Math.min(100, Math.max(0, pct));
  // Fractional rank into the zero-based index space.
  const rank = (boundedPct / 100) * (size - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  const floorValue = sortedAscending[below] ?? 0;
  if (below === above) {
    return floorValue;
  }
  const ceilValue = sortedAscending[above] ?? 0;
  return floorValue + ((ceilValue - floorValue) * (rank - below));
}
/**
 * Summarizes a list of millisecond samples.
 *
 * @param samples Latency samples in any order; the input array is not modified.
 * @returns Count, rounded average and percentiles, and min/max, or `null` when there are no samples.
 */
function computeLatencyStats(samples: number[]): LatencyStats | null {
  if (samples.length === 0) {
    return null;
  }
  const ordered = samples.slice().sort((left, right) => left - right);
  const count = ordered.length;
  let sum = 0;
  for (const sample of ordered) {
    sum += sample;
  }
  return {
    count,
    avg_ms: Math.round(sum / count),
    p50_ms: Math.round(percentile(ordered, 50)),
    p95_ms: Math.round(percentile(ordered, 95)),
    min_ms: ordered[0] ?? 0,
    max_ms: ordered[count - 1] ?? 0,
  };
}
/**
 * Expresses `part / whole` as a percentage rounded to two decimals.
 *
 * @returns The percentage, or `null` when `whole` is zero or negative.
 */
function toPct(part: number, whole: number): number | null {
  return whole <= 0 ? null : Math.round((part / whole) * 10000) / 100;
}
/**
 * Maps a free-form fallback reason onto a coarse category by keyword matching.
 *
 * The reason is lower-cased and every whitespace run is collapsed to a single
 * space before matching. The fall-through category is therefore always a
 * single line, so multi-line reasons cannot break a Markdown table row and
 * whitespace variants of one message share a category. Rules are checked in
 * order and the first match wins.
 *
 * @param reason Raw fallback reason text.
 * @returns `unknown` for a blank reason, a named category when a keyword
 *   matches, otherwise the normalized reason truncated to 80 characters.
 */
function normalizeFallbackCategory(reason: string): string {
  const normalized = reason.replace(/\s+/g, ' ').trim().toLowerCase();
  if (!normalized) {
    return 'unknown';
  }
  // Ordered keyword rules; earlier entries take precedence over later ones.
  const rules: ReadonlyArray<{ category: string; keywords: readonly string[] }> = [
    { category: 'timeout', keywords: ['timeout', 'timed out'] },
    { category: 'cancelled', keywords: ['abort', 'cancel'] },
    { category: 'rate_limit', keywords: ['rate limit', '429'] },
    { category: 'auth', keywords: ['unauthorized', 'forbidden', '401', '403', 'api key'] },
    { category: 'network', keywords: ['fetch failed', 'network', 'socket', 'econn', 'enotfound', 'connect'] },
    { category: 'tool_or_capability', keywords: ['tool', 'capability'] },
    { category: 'response_format', keywords: ['json', 'parse', 'format'] },
  ];
  for (const rule of rules) {
    if (rule.keywords.some((keyword) => normalized.includes(keyword))) {
      return rule.category;
    }
  }
  return normalized.slice(0, 80);
}
/**
 * Prepares a fallback reason for display: collapses all whitespace runs to
 * single spaces, trims the ends, and truncates to 140 characters.
 *
 * @returns The single-line reason, or `unknown` when nothing but whitespace was given.
 */
function normalizeFallbackReasonForDisplay(reason: string): string {
  const words = reason.split(/\s+/).filter((word) => word.length > 0);
  if (words.length === 0) {
    return 'unknown';
  }
  return words.join(' ').slice(0, 140);
}
/**
 * Aggregates completion and end-to-end latency for the turns of one backend.
 *
 * A turn counts as completed when it has an assistant timestamp. Latency is
 * assistant timestamp minus route timestamp, and negative differences are
 * left out of the latency samples (the turn still counts as completed).
 *
 * @param backend Backend the turns belong to; echoed in the result.
 * @param turns Turns that selected this backend.
 */
function buildBackendStats(backend: RoutedBackendName, turns: RouteTurn[]): BackendStats {
  const latencySamples: number[] = [];
  let completedCount = 0;
  for (const turn of turns) {
    if (typeof turn.assistantTimestamp !== 'number') {
      continue;
    }
    completedCount += 1;
    const elapsed = turn.assistantTimestamp - turn.routeTimestamp;
    if (elapsed >= 0) {
      latencySamples.push(elapsed);
    }
  }
  const routeCount = turns.length;
  return {
    backend,
    routes: routeCount,
    completed_turns: completedCount,
    incomplete_turns: routeCount - completedCount,
    completion_rate_pct: toPct(completedCount, routeCount),
    e2e_latency_ms: computeLatencyStats(latencySamples),
  };
}
/**
 * Aggregates external attempt outcomes for a set of turns.
 *
 * A turn is an attempt once it has an outcome. Turns without one are reported
 * as unresolved. Only non-negative numeric durations feed the latency stats.
 *
 * @param turns Turns to aggregate (the caller passes the target backend's turns).
 */
function buildAttemptStats(turns: RouteTurn[]): ExternalBackendAttemptStats {
  let attemptCount = 0;
  let successCount = 0;
  let fallbackCount = 0;
  const durations: number[] = [];
  for (const turn of turns) {
    if (turn.externalOutcome === undefined) {
      continue;
    }
    attemptCount += 1;
    if (turn.externalOutcome === 'success') {
      successCount += 1;
    } else if (turn.externalOutcome === 'fallback') {
      fallbackCount += 1;
    }
    const duration = turn.externalDurationMs;
    if (typeof duration === 'number' && duration >= 0) {
      durations.push(duration);
    }
  }
  return {
    attempts: attemptCount,
    successes: successCount,
    fallbacks: fallbackCount,
    unresolved_attempts: turns.length - attemptCount,
    success_rate_pct: toPct(successCount, attemptCount),
    attempt_latency_ms: computeLatencyStats(durations),
  };
}
/**
 * Turns category counts into rows sorted by count (descending) and then by
 * category name, each with its share of the total.
 *
 * @param categoryCounts Count per fallback category.
 * @returns Sorted rows; `pct` is rounded to two decimals and is 0 when the total is 0.
 */
function sortFallbackReasonStats(categoryCounts: Map<string, number>): FallbackReasonStats[] {
  let grandTotal = 0;
  for (const count of categoryCounts.values()) {
    grandTotal += count;
  }
  const ranked = Array.from(categoryCounts);
  ranked.sort(([nameA, countA], [nameB, countB]) => countB - countA || nameA.localeCompare(nameB));
  return ranked.map(([category, count]) => {
    const pct = grandTotal > 0 ? Math.round((count / grandTotal) * 10000) / 100 : 0;
    return { category, count, pct };
  });
}
/**
 * Returns the most frequent fallback reasons, sorted by count (descending)
 * and then by reason text.
 *
 * @param rawReasonCounts Count per display-normalized reason.
 * @param limit Maximum number of rows to return (default 10).
 */
function sortTopRawReasons(rawReasonCounts: Map<string, number>, limit = 10): FallbackRawReasonStats[] {
  const ranked = Array.from(rawReasonCounts, ([reason, count]) => ({ reason, count }));
  ranked.sort((left, right) => right.count - left.count || left.reason.localeCompare(right.reason));
  return ranked.slice(0, limit);
}
/**
 * Builds a canary summary from audit events.
 *
 * Events are processed in timestamp order. Each accepted `backend.route` event
 * opens a turn, and later `backend.success`, `backend.fallback`, and assistant
 * `session.message` events from the same session are paired with pending
 * turns. The turns are then aggregated for the target and baseline backends.
 *
 * @param events Audit events in any order; the array is not modified.
 * @param options Target/baseline backends and optional route filters.
 * @returns Route counts, per-backend stats, target attempt stats, target-minus-baseline
 *   deltas, and the fallback taxonomy.
 */
export function summarizeBackendCanary(events: AuditEvent[], options: BackendCanarySummaryOptions): BackendCanarySummary {
// An empty set means "no filter" for that dimension (see the size checks below).
const sessionFilter = new Set(options.sessionIds ?? []);
const channelFilter = new Set(options.channels ?? []);
const senderFilter = new Set(options.senders ?? []);
const sourceFilter = new Set(options.routeSources ?? []);
const routeStats: RouteStats = {
total: 0,
by_backend: {},
by_source: {},
forced_native_guards: {},
};
// Per-session FIFO of turns that have not yet been paired with an assistant message.
const turnQueues = new Map<string, RouteTurn[]>();
// Every accepted turn, in route order; the queues hold references to these same objects.
const turns: RouteTurn[] = [];
const fallbackCategoryCounts = new Map<string, number>();
const fallbackRawReasonCounts = new Map<string, number>();
// Sort a copy so pairing sees events chronologically without mutating the caller's array.
const sortedEvents = [...events].sort((a, b) => a.timestamp - b.timestamp);
for (const event of sortedEvents) {
const payload = toRecord(event.event);
if (event.event_type === 'backend.route') {
const sessionId = readString(payload.session_id);
const channel = readString(payload.channel);
const sender = readString(payload.sender);
const selectedBackend = payload.selected_backend;
const source = payload.source;
const guardReason = payload.guard_reason;
// Route events missing a required field, or carrying an unrecognised backend/source, are ignored.
if (!sessionId || !channel || !sender || !isRoutedBackendName(selectedBackend) || !isRouteSource(source)) {
continue;
}
if (sessionFilter.size > 0 && !sessionFilter.has(sessionId)) {
continue;
}
if (channelFilter.size > 0 && !channelFilter.has(channel)) {
continue;
}
if (senderFilter.size > 0 && !senderFilter.has(sender)) {
continue;
}
if (sourceFilter.size > 0 && !sourceFilter.has(source)) {
continue;
}
const turn: RouteTurn = {
sessionId,
channel,
sender,
selectedBackend,
source,
guardReason: isGuardReason(guardReason) ? guardReason : undefined,
routeTimestamp: event.timestamp,
};
routeStats.total += 1;
routeStats.by_backend[selectedBackend] = (routeStats.by_backend[selectedBackend] ?? 0) + 1;
routeStats.by_source[source] = (routeStats.by_source[source] ?? 0) + 1;
if (turn.guardReason) {
routeStats.forced_native_guards[turn.guardReason] = (routeStats.forced_native_guards[turn.guardReason] ?? 0) + 1;
}
const queue = turnQueues.get(sessionId) ?? [];
queue.push(turn);
turnQueues.set(sessionId, queue);
turns.push(turn);
continue;
}
// Non-route events are only considered for sessions that currently have a pending turn,
// which also drops events belonging to routes that were filtered out above.
const sessionId = readString(payload.session_id);
if (!sessionId) {
continue;
}
const queue = turnQueues.get(sessionId);
if (!queue || queue.length === 0) {
continue;
}
if (event.event_type === 'backend.success') {
const backend = payload.backend;
const durationMs = readNumber(payload.duration_ms);
// Only external backends are tracked as attempts.
if (!isRoutedBackendName(backend) || backend === 'native') {
continue;
}
// Pair with the oldest pending turn for this backend that has no outcome yet.
const turn = queue.find((candidate) => (
candidate.selectedBackend === backend
&& candidate.externalOutcome === undefined
));
if (!turn) {
continue;
}
turn.externalOutcome = 'success';
turn.externalDurationMs = durationMs;
continue;
}
if (event.event_type === 'backend.fallback') {
const fromBackend = payload.from_backend;
const reason = readString(payload.reason) ?? 'unknown';
const durationMs = readNumber(payload.duration_ms);
if (!isRoutedBackendName(fromBackend) || fromBackend === 'native') {
continue;
}
const turn = queue.find((candidate) => (
candidate.selectedBackend === fromBackend
&& candidate.externalOutcome === undefined
));
if (!turn) {
continue;
}
turn.externalOutcome = 'fallback';
turn.externalDurationMs = durationMs;
turn.fallbackReason = reason;
// Fallbacks are tallied for any external backend with a matching turn, not only the target backend.
const category = normalizeFallbackCategory(reason);
fallbackCategoryCounts.set(category, (fallbackCategoryCounts.get(category) ?? 0) + 1);
const normalizedReason = normalizeFallbackReasonForDisplay(reason);
fallbackRawReasonCounts.set(normalizedReason, (fallbackRawReasonCounts.get(normalizedReason) ?? 0) + 1);
continue;
}
if (event.event_type === 'session.message') {
const role = readString(payload.role);
if (role !== 'assistant') {
continue;
}
// An assistant message completes the oldest pending turn of the session (FIFO) and removes it from the queue.
const turn = queue[0];
if (!turn) {
continue;
}
turn.assistantTimestamp = event.timestamp;
queue.shift();
}
}
const targetTurns = turns.filter((turn) => turn.selectedBackend === options.targetBackend);
const baselineTurns = turns.filter((turn) => turn.selectedBackend === options.baselineBackend);
const targetStats = buildBackendStats(options.targetBackend, targetTurns);
const baselineStats = buildBackendStats(options.baselineBackend, baselineTurns);
// Each delta is target minus baseline, and null when either side has no figure.
const comparison: BackendComparisonStats = {
completion_rate_delta_pp: (
targetStats.completion_rate_pct !== null && baselineStats.completion_rate_pct !== null
)
? Math.round((targetStats.completion_rate_pct - baselineStats.completion_rate_pct) * 100) / 100
: null,
p50_latency_delta_ms: (
targetStats.e2e_latency_ms && baselineStats.e2e_latency_ms
)
? targetStats.e2e_latency_ms.p50_ms - baselineStats.e2e_latency_ms.p50_ms
: null,
p95_latency_delta_ms: (
targetStats.e2e_latency_ms && baselineStats.e2e_latency_ms
)
? targetStats.e2e_latency_ms.p95_ms - baselineStats.e2e_latency_ms.p95_ms
: null,
};
// Attempt stats are only produced for an external target backend.
const targetExternalAttempts = options.targetBackend === 'native'
? null
: buildAttemptStats(targetTurns);
return {
route_stats: routeStats,
target: targetStats,
baseline: baselineStats,
target_external_attempts: targetExternalAttempts,
comparison,
fallback_categories: sortFallbackReasonStats(fallbackCategoryCounts),
fallback_top_reasons: sortTopRawReasons(fallbackRawReasonCounts),
};
}
/** Formats a percentage with two decimals and a `%` suffix, or `n/a` for null. */
function formatPct(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return `${value.toFixed(2)}%`;
}
/** Formats a millisecond value with an `ms` suffix, or `n/a` for null. */
function formatMs(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return `${value}ms`;
}
/**
 * Evaluates a canary summary against the configured gate thresholds.
 *
 * Only thresholds that are numbers produce a criterion. A criterion whose
 * underlying figure is unavailable (null) fails, so missing data holds the
 * gate rather than passing it.
 *
 * @param summary Summary produced by summarizeBackendCanary.
 * @param thresholds Thresholds to check; omitted ones are skipped.
 * @returns The per-criterion results and an overall `pass`, which is true when
 *   every evaluated criterion passed or when none was evaluated.
 */
export function evaluateBackendCanaryGate(
  summary: BackendCanarySummary,
  thresholds: BackendCanaryGateThresholds,
): BackendCanaryGateResult {
  const criteria: BackendCanaryGateResult['criteria'] = [];

  if (typeof thresholds.maxCompletionRateDropPp === 'number') {
    const delta = summary.comparison.completion_rate_delta_pp;
    // Lowest acceptable delta. Computed as `0 - x` so a zero threshold yields +0
    // and the label is built from the signed number, avoiding output such as
    // ">= -0.00pp" or ">= --5.00pp".
    const minimumDelta = 0 - thresholds.maxCompletionRateDropPp;
    const pass = delta !== null && delta >= minimumDelta;
    criteria.push({
      criterion: 'Completion rate delta (target - baseline)',
      pass,
      actual: delta === null ? 'n/a' : `${delta.toFixed(2)}pp`,
      threshold: `>= ${minimumDelta.toFixed(2)}pp`,
    });
  }

  if (typeof thresholds.maxP50LatencyIncreaseMs === 'number') {
    const delta = summary.comparison.p50_latency_delta_ms;
    const pass = delta !== null && delta <= thresholds.maxP50LatencyIncreaseMs;
    criteria.push({
      criterion: 'P50 latency delta (target - baseline)',
      pass,
      actual: formatMs(delta),
      threshold: `<= ${thresholds.maxP50LatencyIncreaseMs}ms`,
    });
  }

  if (typeof thresholds.maxP95LatencyIncreaseMs === 'number') {
    const delta = summary.comparison.p95_latency_delta_ms;
    const pass = delta !== null && delta <= thresholds.maxP95LatencyIncreaseMs;
    criteria.push({
      criterion: 'P95 latency delta (target - baseline)',
      pass,
      actual: formatMs(delta),
      threshold: `<= ${thresholds.maxP95LatencyIncreaseMs}ms`,
    });
  }

  if (typeof thresholds.maxFallbackRatePct === 'number') {
    // Fallback rate is fallbacks over attempts; it is null when there are no
    // attempt stats (native target) or no attempts at all.
    const fallbackRate = summary.target_external_attempts
      ? toPct(summary.target_external_attempts.fallbacks, summary.target_external_attempts.attempts)
      : null;
    const pass = fallbackRate !== null && fallbackRate <= thresholds.maxFallbackRatePct;
    criteria.push({
      criterion: 'Fallback rate (target external attempts)',
      pass,
      actual: formatPct(fallbackRate),
      threshold: `<= ${thresholds.maxFallbackRatePct.toFixed(2)}%`,
    });
  }

  return {
    pass: criteria.length > 0 ? criteria.every((criterion) => criterion.pass) : true,
    criteria,
  };
}
/**
 * Renders one Markdown bullet describing a latency distribution.
 *
 * @param label Text shown before the colon.
 * @param stats Distribution to render; `null` renders as `n/a`.
 */
function renderLatencyInline(label: string, stats: LatencyStats | null): string {
  if (!stats) {
    return `- ${label}: n/a`;
  }
  const fields = [
    `count=${stats.count}`,
    `avg=${stats.avg_ms}ms`,
    `p50=${stats.p50_ms}ms`,
    `p95=${stats.p95_ms}ms`,
    `min=${stats.min_ms}ms`,
    `max=${stats.max_ms}ms`,
  ];
  return `- ${label}: ${fields.join(', ')}`;
}
/**
 * Renders a canary summary as a Markdown report.
 *
 * Sections, in order: header, route distribution, reliability, latency,
 * fallback taxonomy, top fallback reasons, and (when a gate result is given)
 * gate evaluation. Free-text values placed in table cells are sanitized so a
 * `|` or line break inside a fallback category cannot corrupt the table.
 *
 * @param summary Summary produced by summarizeBackendCanary.
 * @param options Options used for the summary; only the backend names are rendered.
 * @param gate Optional gate result; a failed gate is reported as `HOLD`.
 * @returns The report as a single newline-joined string.
 */
export function renderBackendCanaryMarkdown(
  summary: BackendCanarySummary,
  options: BackendCanarySummaryOptions,
  gate?: BackendCanaryGateResult,
): string {
  // Keeps a value on one line and escapes pipes so it stays inside its table cell.
  const escapeTableCell = (text: string): string => text.replace(/\s+/g, ' ').replace(/\|/g, '\\|');

  const lines: string[] = [];
  lines.push('# Pi Embedded Canary Summary');
  lines.push('');
  lines.push(`- Target backend: \`${options.targetBackend}\``);
  lines.push(`- Baseline backend: \`${options.baselineBackend}\``);
  lines.push(`- Routes analyzed: ${summary.route_stats.total}`);
  lines.push('');

  lines.push('## Route Distribution');
  lines.push('');
  lines.push('| Backend | Routes |');
  lines.push('| --- | ---: |');
  // Most-used backend first; ties broken by name for stable output.
  const backendRows = Object.entries(summary.route_stats.by_backend)
    .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
  for (const [backend, count] of backendRows) {
    lines.push(`| ${escapeTableCell(backend)} | ${count} |`);
  }
  if (backendRows.length === 0) {
    lines.push('| _none_ | 0 |');
  }
  lines.push('');

  lines.push('## Reliability');
  lines.push('');
  lines.push('| Metric | Target | Baseline | Delta |');
  lines.push('| --- | ---: | ---: | ---: |');
  lines.push(`| Turn completion rate | ${formatPct(summary.target.completion_rate_pct)} | ${formatPct(summary.baseline.completion_rate_pct)} | ${summary.comparison.completion_rate_delta_pp === null ? 'n/a' : `${summary.comparison.completion_rate_delta_pp.toFixed(2)}pp`} |`);
  if (summary.target_external_attempts) {
    lines.push(`| External success rate | ${formatPct(summary.target_external_attempts.success_rate_pct)} | n/a | n/a |`);
    lines.push(`| External attempts | ${summary.target_external_attempts.attempts} | n/a | n/a |`);
    lines.push(`| External fallbacks | ${summary.target_external_attempts.fallbacks} | n/a | n/a |`);
  }
  lines.push('');

  lines.push('## Latency');
  lines.push('');
  lines.push(renderLatencyInline('Target end-to-end', summary.target.e2e_latency_ms));
  lines.push(renderLatencyInline('Baseline end-to-end', summary.baseline.e2e_latency_ms));
  lines.push(`- P50 delta (target - baseline): ${formatMs(summary.comparison.p50_latency_delta_ms)}`);
  lines.push(`- P95 delta (target - baseline): ${formatMs(summary.comparison.p95_latency_delta_ms)}`);
  if (summary.target_external_attempts) {
    lines.push(renderLatencyInline('Target external attempt', summary.target_external_attempts.attempt_latency_ms));
  }
  lines.push('');

  lines.push('## Fallback Taxonomy');
  lines.push('');
  lines.push('| Category | Count | Percent |');
  lines.push('| --- | ---: | ---: |');
  if (summary.fallback_categories.length === 0) {
    lines.push('| _none_ | 0 | 0.00% |');
  } else {
    for (const item of summary.fallback_categories) {
      // Uncategorized reasons are raw error text, so they must be sanitized for the table.
      lines.push(`| ${escapeTableCell(item.category)} | ${item.count} | ${item.pct.toFixed(2)}% |`);
    }
  }
  lines.push('');

  lines.push('## Top Fallback Reasons');
  lines.push('');
  if (summary.fallback_top_reasons.length === 0) {
    lines.push('- none');
  } else {
    for (const item of summary.fallback_top_reasons) {
      lines.push(`- ${item.reason} (${item.count})`);
    }
  }
  lines.push('');

  if (gate) {
    lines.push('## Gate Evaluation');
    lines.push('');
    lines.push(`- Gate result: ${gate.pass ? 'PASS' : 'HOLD'}`);
    for (const criterion of gate.criteria) {
      lines.push(`- [${criterion.pass ? 'x' : ' '}] ${criterion.criterion}: actual=${criterion.actual}, threshold=${criterion.threshold}`);
    }
    lines.push('');
  }
  return lines.join('\n');
}