diff --git a/package.json b/package.json index 3ea5bd4..6029da6 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,8 @@ "lint": "eslint src/", "typecheck": "tsc --noEmit", "config:profiles:generate": "node scripts/generate-config-profiles.mjs", - "config:profiles:check": "node scripts/generate-config-profiles.mjs --check" + "config:profiles:check": "node scripts/generate-config-profiles.mjs --check", + "audit:backend-canary": "node --import tsx/esm scripts/summarize-backend-canary.ts" }, "keywords": [ "ai", diff --git a/scripts/summarize-backend-canary.ts b/scripts/summarize-backend-canary.ts new file mode 100755 index 0000000..c7cb8d0 --- /dev/null +++ b/scripts/summarize-backend-canary.ts @@ -0,0 +1,208 @@ +#!/usr/bin/env node + +import { writeFile } from 'node:fs/promises'; +import { parseArgs } from 'node:util'; +import { queryAuditLogs } from '../src/audit/export.js'; +import { + evaluateBackendCanaryGate, + renderBackendCanaryMarkdown, + summarizeBackendCanary, + type BackendCanaryGateThresholds, + type BackendCanarySummaryOptions, + type BackendRouteSource, + type RoutedBackendName, +} from '../src/audit/backendCanarySummary.js'; + +const DEFAULT_EVENT_TYPES = ['backend.route', 'backend.success', 'backend.fallback', 'session.message'] as const; + +function usage(): string { + return [ + 'Usage: node --import tsx/esm scripts/summarize-backend-canary.ts --audit [options]', + '', + 'Options:', + ' --audit Path to audit.log (required)', + ' --backend Target backend (default: pi_embedded)', + ' --baseline Baseline backend (default: native)', + ' --since Start time filter', + ' --until End time filter', + ' --session Restrict to session IDs', + ' --channel Restrict to channels', + ' --sender Restrict to senders', + ' --source Restrict route sources (agent_override,default_external,native,forced_native_guard)', + ' --format Output format (default: markdown)', + ' --out Write output to file instead of stdout', + '', + 'Gate options (optional):', + ' 
/**
 * Parses a time argument such as `--since` / `--until`.
 *
 * A value made only of decimal digits is taken as epoch milliseconds; anything
 * else is handed to `Date.parse`.
 *
 * @param value Raw CLI value; `undefined` or an empty string means "not set".
 * @param flag Flag name used in the error message.
 * @returns Epoch milliseconds, or `undefined` when the flag was not provided.
 * @throws Error when the value is neither all digits nor parseable by `Date.parse`.
 */
function parseTime(value: string | undefined, flag: string): number | undefined {
  if (!value) {
    return undefined;
  }
  if (/^\d+$/.test(value)) {
    const asNumber = Number(value);
    // An absurdly long digit string overflows to Infinity; let Date.parse reject it below.
    if (Number.isFinite(asNumber)) {
      return asNumber;
    }
  }
  const parsed = Date.parse(value);
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid ${flag} value "${value}". Use ISO-8601 or epoch milliseconds.`);
  }
  return parsed;
}

/**
 * Splits a comma-separated CLI value into trimmed, non-empty items.
 *
 * @param value Raw CLI value; `undefined` or an empty string means "not set".
 * @returns The items in input order, or `undefined` when nothing usable remains.
 */
function parseCsv(value: string | undefined): string[] | undefined {
  if (!value) {
    return undefined;
  }
  const values = value
    .split(',')
    .map((item) => item.trim())
    .filter((item) => item.length > 0);
  return values.length > 0 ? values : undefined;
}

/**
 * Parses an optional numeric CLI value (used for the gate thresholds).
 *
 * @param raw Raw CLI value; `undefined` or an empty string means "not set".
 * @param flag Flag name used in the error message.
 * @returns The finite number, or `undefined` when the flag was not provided.
 * @throws Error when the value is blank or does not convert to a finite number.
 */
function parseOptionalNumber(raw: string | undefined, flag: string): number | undefined {
  if (!raw) {
    return undefined;
  }
  const trimmed = raw.trim();
  // `Number('   ')` is 0, so a whitespace-only argument would otherwise be
  // accepted as a threshold of zero; treat it as invalid instead.
  const parsed = trimmed.length > 0 ? Number(trimmed) : Number.NaN;
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid ${flag} value "${raw}". Expected a number.`);
  }
  return parsed;
}
/**
 * CLI entry point: parses flags, loads audit events, builds the canary
 * summary, optionally evaluates gate thresholds, and emits the report.
 *
 * Flow:
 *  1. Parse flags with `parseArgs` (strict, no positionals); `--help` prints usage and returns.
 *  2. Validate `--audit`, `--format`, the backend/source selectors and the time window.
 *  3. Load events through `queryAuditLogs`, restricted to `DEFAULT_EVENT_TYPES`.
 *  4. Summarize, then evaluate the gate only when at least one threshold flag was given.
 *  5. Write Markdown or JSON (plus a trailing newline) to `--out` or to stdout.
 *
 * @throws Error on a missing `--audit`, an unknown `--format`, an invalid backend,
 *   source, time or number value, or a `--since` later than `--until`.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    options: {
      audit: { type: 'string' },
      backend: { type: 'string' },
      baseline: { type: 'string' },
      since: { type: 'string' },
      until: { type: 'string' },
      session: { type: 'string' },
      channel: { type: 'string' },
      sender: { type: 'string' },
      source: { type: 'string' },
      format: { type: 'string' },
      out: { type: 'string' },
      'gate-max-completion-drop-pp': { type: 'string' },
      'gate-max-p50-latency-increase-ms': { type: 'string' },
      'gate-max-p95-latency-increase-ms': { type: 'string' },
      'gate-max-fallback-rate-pct': { type: 'string' },
      help: { type: 'boolean', short: 'h' },
    },
    strict: true,
    allowPositionals: false,
  });

  if (values.help) {
    process.stdout.write(`${usage()}\n`);
    return;
  }

  if (!values.audit) {
    throw new Error('--audit is required.');
  }

  const format = values.format ?? 'markdown';
  if (format !== 'markdown' && format !== 'json') {
    throw new Error(`Invalid --format value "${format}".`);
  }

  const summaryOptions: BackendCanarySummaryOptions = {
    targetBackend: parseBackendName(values.backend, 'pi_embedded'),
    baselineBackend: parseBackendName(values.baseline, 'native'),
    sessionIds: parseCsv(values.session),
    channels: parseCsv(values.channel),
    senders: parseCsv(values.sender),
    routeSources: parseSources(values.source),
  };

  const startTime = parseTime(values.since, '--since');
  const endTime = parseTime(values.until, '--until');
  // An inverted window can never match an event; fail loudly rather than
  // emitting an all-"n/a" report that looks like a legitimate empty result.
  if (startTime !== undefined && endTime !== undefined && startTime > endTime) {
    throw new Error('--since must not be later than --until.');
  }

  const events = await queryAuditLogs(values.audit, {
    start_time: startTime,
    end_time: endTime,
    event_types: [...DEFAULT_EVENT_TYPES],
  });

  const summary = summarizeBackendCanary(events, summaryOptions);

  const gateThresholds: BackendCanaryGateThresholds = {
    maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
    maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
    maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
    maxFallbackRatePct: parseOptionalNumber(values['gate-max-fallback-rate-pct'], '--gate-max-fallback-rate-pct'),
  };

  // The gate is opt-in: with no threshold flags the report carries no gate section.
  const hasGateThreshold = Object.values(gateThresholds).some((value) => typeof value === 'number');
  const gateResult = hasGateThreshold ? evaluateBackendCanaryGate(summary, gateThresholds) : undefined;

  // NOTE(review): a failing gate is only reported in the output; it does not
  // change the process exit code. Confirm whether CI callers expect non-zero.
  const output = format === 'json'
    ? JSON.stringify({
      generated_at: new Date().toISOString(),
      event_count: events.length,
      filters: {
        since_ms: startTime,
        until_ms: endTime,
      },
      options: summaryOptions,
      summary,
      gate: gateResult,
    }, null, 2)
    : renderBackendCanaryMarkdown(summary, summaryOptions, gateResult);

  if (values.out) {
    await writeFile(values.out, `${output}\n`, 'utf-8');
  } else {
    process.stdout.write(`${output}\n`);
  }
}
sender: '8367012007', + selected_backend: 'pi_embedded', + source: 'agent_override', + }), + makeEvent(2300, 'backend.fallback', { + session_id: 'telegram:canary', + channel: 'telegram', + sender: '8367012007', + from_backend: 'pi_embedded', + to_backend: 'native', + reason: 'request timed out waiting for backend process', + duration_ms: 300, + }), + makeEvent(2340, 'session.message', { + session_id: 'telegram:canary', + role: 'assistant', + content_length: 80, + }), + makeEvent(3000, 'backend.route', { + session_id: 'telegram:control', + channel: 'telegram', + sender: '123', + selected_backend: 'native', + source: 'native', + }), + makeEvent(3080, 'session.message', { + session_id: 'telegram:control', + role: 'assistant', + content_length: 25, + }), + ]; + + const summary = summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + + expect(summary.route_stats.total).toBe(3); + expect(summary.route_stats.by_backend.pi_embedded).toBe(2); + expect(summary.route_stats.by_backend.native).toBe(1); + + expect(summary.target.routes).toBe(2); + expect(summary.target.completed_turns).toBe(2); + expect(summary.target.completion_rate_pct).toBe(100); + expect(summary.target.e2e_latency_ms?.p50_ms).toBe(240); + expect(summary.target.e2e_latency_ms?.p95_ms).toBe(330); + + expect(summary.baseline.routes).toBe(1); + expect(summary.baseline.completion_rate_pct).toBe(100); + expect(summary.baseline.e2e_latency_ms?.p50_ms).toBe(80); + + expect(summary.target_external_attempts?.attempts).toBe(2); + expect(summary.target_external_attempts?.successes).toBe(1); + expect(summary.target_external_attempts?.fallbacks).toBe(1); + expect(summary.target_external_attempts?.success_rate_pct).toBe(50); + expect(summary.target_external_attempts?.attempt_latency_ms?.p50_ms).toBe(210); + + expect(summary.comparison.p50_latency_delta_ms).toBe(160); + expect(summary.comparison.p95_latency_delta_ms).toBe(250); + + expect(summary.fallback_categories).toEqual([ 
+ { category: 'timeout', count: 1, pct: 100 }, + ]); + expect(summary.fallback_top_reasons[0]?.reason).toContain('request timed out'); + }); + + it('filters routes by session id', () => { + const events: AuditEvent[] = [ + makeEvent(1000, 'backend.route', { + session_id: 'telegram:canary', + channel: 'telegram', + sender: '8367012007', + selected_backend: 'pi_embedded', + source: 'agent_override', + }), + makeEvent(1100, 'session.message', { + session_id: 'telegram:canary', + role: 'assistant', + content_length: 10, + }), + makeEvent(2000, 'backend.route', { + session_id: 'telegram:other', + channel: 'telegram', + sender: '9999', + selected_backend: 'native', + source: 'native', + }), + makeEvent(2100, 'session.message', { + session_id: 'telegram:other', + role: 'assistant', + content_length: 10, + }), + ]; + + const summary = summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + sessionIds: ['telegram:canary'], + }); + + expect(summary.route_stats.total).toBe(1); + expect(summary.target.routes).toBe(1); + expect(summary.baseline.routes).toBe(0); + }); +}); + +describe('evaluateBackendCanaryGate', () => { + it('evaluates configured pass/fail thresholds', () => { + const events: AuditEvent[] = [ + makeEvent(1000, 'backend.route', { + session_id: 's1', + channel: 'telegram', + sender: '1', + selected_backend: 'pi_embedded', + source: 'agent_override', + }), + makeEvent(1200, 'backend.success', { + session_id: 's1', + channel: 'telegram', + sender: '1', + backend: 'pi_embedded', + duration_ms: 200, + response_length: 10, + }), + makeEvent(1250, 'session.message', { + session_id: 's1', + role: 'assistant', + content_length: 20, + }), + makeEvent(2000, 'backend.route', { + session_id: 's2', + channel: 'telegram', + sender: '2', + selected_backend: 'native', + source: 'native', + }), + makeEvent(2050, 'session.message', { + session_id: 's2', + role: 'assistant', + content_length: 20, + }), + ]; + + const summary = 
summarizeBackendCanary(events, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }); + + const gate = evaluateBackendCanaryGate(summary, { + maxCompletionRateDropPp: 0, + maxP50LatencyIncreaseMs: 300, + maxP95LatencyIncreaseMs: 300, + maxFallbackRatePct: 5, + }); + + expect(gate.pass).toBe(true); + expect(gate.criteria).toHaveLength(4); + + const markdown = renderBackendCanaryMarkdown(summary, { + targetBackend: 'pi_embedded', + baselineBackend: 'native', + }, gate); + expect(markdown).toContain('Pi Embedded Canary Summary'); + expect(markdown).toContain('Gate result: PASS'); + }); +}); diff --git a/src/audit/backendCanarySummary.ts b/src/audit/backendCanarySummary.ts new file mode 100644 index 0000000..8db7506 --- /dev/null +++ b/src/audit/backendCanarySummary.ts @@ -0,0 +1,618 @@ +import type { ExternalBackendName } from '../backends/external.js'; +import type { AuditEvent } from './types.js'; + +export type RoutedBackendName = 'native' | ExternalBackendName; + +export type BackendRouteSource = 'agent_override' | 'default_external' | 'native' | 'forced_native_guard'; + +export type BackendGuardReason = 'capability_query' | 'pi_no_tools_mode' | 'attachments_present'; + +export interface BackendCanarySummaryOptions { + targetBackend: RoutedBackendName; + baselineBackend: RoutedBackendName; + sessionIds?: string[]; + channels?: string[]; + senders?: string[]; + routeSources?: BackendRouteSource[]; +} + +export interface LatencyStats { + count: number; + avg_ms: number; + p50_ms: number; + p95_ms: number; + min_ms: number; + max_ms: number; +} + +export interface RouteStats { + total: number; + by_backend: Partial>; + by_source: Partial>; + forced_native_guards: Partial>; +} + +export interface BackendStats { + backend: RoutedBackendName; + routes: number; + completed_turns: number; + incomplete_turns: number; + completion_rate_pct: number | null; + e2e_latency_ms: LatencyStats | null; +} + +export interface ExternalBackendAttemptStats { + attempts: 
/**
 * Views an arbitrary value as a string-keyed record.
 *
 * Any non-null value whose `typeof` is `'object'` (arrays included) is returned
 * as-is; everything else yields a fresh empty object so callers can read
 * properties without further checks.
 */
function toRecord(value: unknown): Record<string, unknown> {
  if (!value || typeof value !== 'object') {
    return {};
  }
  return value as Record<string, unknown>;
}

/** Returns `value` when it is a string (including the empty string), otherwise `undefined`. */
function readString(value: unknown): string | undefined {
  if (typeof value !== 'string') {
    return undefined;
  }
  return value;
}
/**
 * Linear-interpolated percentile over an already ascending-sorted sample list.
 *
 * The requested percentile is clamped to [0, 100] and mapped onto the index
 * range [0, length - 1]; a fractional rank is interpolated between its two
 * neighbouring samples.
 *
 * @param sortedAscending Samples sorted in ascending order (not re-sorted here).
 * @param pct Percentile to read, in percent.
 * @returns 0 for an empty list, the sole sample for a one-element list,
 *   otherwise the (possibly interpolated) sample value.
 */
function percentile(sortedAscending: number[], pct: number): number {
  const size = sortedAscending.length;
  if (size === 0) {
    return 0;
  }
  if (size === 1) {
    return sortedAscending[0];
  }

  const boundedPct = Math.min(100, Math.max(0, pct));
  const rank = (boundedPct / 100) * (size - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);

  const lowValue = sortedAscending[below] ?? 0;
  if (below === above) {
    // The rank landed exactly on a sample; no interpolation needed.
    return lowValue;
  }
  const highValue = sortedAscending[above] ?? 0;
  const fraction = rank - below;
  return lowValue + ((highValue - lowValue) * fraction);
}
/**
 * Expresses `part / whole` as a percentage rounded to two decimals.
 *
 * @returns `null` when `whole` is zero or negative (no meaningful rate).
 */
function toPct(part: number, whole: number): number | null {
  if (whole <= 0) {
    return null;
  }
  return Math.round((part / whole) * 10000) / 100;
}

// HTTP status codes must stand alone: a bare substring test would also hit
// digits embedded in ids, ports or durations (e.g. "pid 14031" contains "403").
const RATE_LIMIT_STATUS = /(?<!\d)429(?!\d)/;
const AUTH_STATUS = /(?<!\d)40[13](?!\d)/;
// "information" contains the letters "format"; exclude that false positive.
const FORMAT_KEYWORD = /(?<!in)format/;

/**
 * Maps a free-text fallback reason onto a coarse category.
 *
 * The reason is lower-cased and whitespace-collapsed, then checked against
 * keyword groups in priority order: timeout, cancelled, rate_limit, auth,
 * network, tool_or_capability, response_format. A reason matching none of them
 * becomes its own category, truncated to 80 characters.
 *
 * @param reason Raw fallback reason text.
 * @returns The category name, or `'unknown'` for a blank reason.
 */
function normalizeFallbackCategory(reason: string): string {
  // Collapsing whitespace keeps the fall-through category on a single line, so
  // a multi-line reason cannot break the Markdown table row it is rendered in.
  const normalized = reason.replace(/\s+/g, ' ').trim().toLowerCase();
  if (!normalized) {
    return 'unknown';
  }
  const mentions = (...needles: string[]): boolean => needles.some((needle) => normalized.includes(needle));

  if (mentions('timeout', 'timed out')) {
    return 'timeout';
  }
  if (mentions('abort', 'cancel')) {
    return 'cancelled';
  }
  if (mentions('rate limit') || RATE_LIMIT_STATUS.test(normalized)) {
    return 'rate_limit';
  }
  if (mentions('unauthorized', 'forbidden', 'api key') || AUTH_STATUS.test(normalized)) {
    return 'auth';
  }
  if (mentions('fetch failed', 'network', 'socket', 'econn', 'enotfound', 'connect')) {
    return 'network';
  }
  if (mentions('tool', 'capability')) {
    return 'tool_or_capability';
  }
  if (mentions('json', 'parse') || FORMAT_KEYWORD.test(normalized)) {
    return 'response_format';
  }
  return normalized.slice(0, 80);
}

/**
 * Prepares a raw fallback reason for display: whitespace runs collapse to a
 * single space, the result is trimmed and capped at 140 characters.
 *
 * @returns `'unknown'` when nothing is left after trimming.
 */
function normalizeFallbackReasonForDisplay(reason: string): string {
  const singleLine = reason.replace(/\s+/g, ' ').trim();
  if (!singleLine) {
    return 'unknown';
  }
  return singleLine.slice(0, 140);
}
/** Returns true when `filter` is empty (no restriction) or contains `value`. */
function passesFilter<T>(filter: ReadonlySet<T>, value: T): boolean {
  return filter.size === 0 || filter.has(value);
}

/** Adds one to the counter stored under `key`, starting from zero. */
function bumpCount<K>(counts: Map<K, number>, key: K): void {
  counts.set(key, (counts.get(key) ?? 0) + 1);
}

/** Finds the oldest queued turn routed to external `backend` that has no recorded outcome yet. */
function findPendingExternalTurn(queue: RouteTurn[], backend: unknown): RouteTurn | undefined {
  if (!isRoutedBackendName(backend) || backend === 'native') {
    return undefined;
  }
  return queue.find((candidate) => (
    candidate.selectedBackend === backend
    && candidate.externalOutcome === undefined
  ));
}

/** Target-minus-baseline difference of one latency field, or null when either side has no samples. */
function latencyDelta(
  target: LatencyStats | null,
  baseline: LatencyStats | null,
  field: 'p50_ms' | 'p95_ms',
): number | null {
  return target && baseline ? target[field] - baseline[field] : null;
}

/**
 * Builds the canary summary for a target backend versus a baseline backend
 * from a list of audit events.
 *
 * Events are processed in ascending timestamp order (the input array is not
 * mutated). Each accepted `backend.route` event opens a "turn" that is queued
 * per session. Later events for the same session update queued turns:
 *  - `backend.success` / `backend.fallback` record the external outcome on the
 *    oldest queued turn for that backend that has no outcome yet;
 *  - a `session.message` with role `assistant` stamps and dequeues the oldest
 *    queued turn, which is what marks a turn as completed.
 * Route events missing required fields, or rejected by the session / channel /
 * sender / source filters, are ignored, and so are follow-up events for
 * sessions with no queued turn.
 *
 * @param events Audit events in any order.
 * @param options Target and baseline backends plus optional route filters; an
 *   absent or empty filter list means "no restriction".
 * @returns Route distribution, per-backend completion and latency stats,
 *   target external-attempt stats (`null` when the target is `native`),
 *   target-minus-baseline deltas, and the fallback taxonomy.
 */
export function summarizeBackendCanary(events: AuditEvent[], options: BackendCanarySummaryOptions): BackendCanarySummary {
  const sessionFilter = new Set<string>(options.sessionIds ?? []);
  const channelFilter = new Set<string>(options.channels ?? []);
  const senderFilter = new Set<string>(options.senders ?? []);
  const sourceFilter = new Set<BackendRouteSource>(options.routeSources ?? []);

  const routeStats: RouteStats = {
    total: 0,
    by_backend: {},
    by_source: {},
    forced_native_guards: {},
  };

  // Per-session FIFO of turns still waiting for their assistant reply.
  const turnQueues = new Map<string, RouteTurn[]>();
  const turns: RouteTurn[] = [];
  const fallbackCategoryCounts = new Map<string, number>();
  const fallbackRawReasonCounts = new Map<string, number>();

  const sortedEvents = [...events].sort((a, b) => a.timestamp - b.timestamp);

  for (const event of sortedEvents) {
    const payload = toRecord(event.event);

    if (event.event_type === 'backend.route') {
      const routeSessionId = readString(payload.session_id);
      const channel = readString(payload.channel);
      const sender = readString(payload.sender);
      const selectedBackend = payload.selected_backend;
      const source = payload.source;
      const guardReason = payload.guard_reason;

      if (!routeSessionId || !channel || !sender || !isRoutedBackendName(selectedBackend) || !isRouteSource(source)) {
        continue;
      }
      if (
        !passesFilter(sessionFilter, routeSessionId)
        || !passesFilter(channelFilter, channel)
        || !passesFilter(senderFilter, sender)
        || !passesFilter(sourceFilter, source)
      ) {
        continue;
      }

      const turn: RouteTurn = {
        sessionId: routeSessionId,
        channel,
        sender,
        selectedBackend,
        source,
        guardReason: isGuardReason(guardReason) ? guardReason : undefined,
        routeTimestamp: event.timestamp,
      };

      routeStats.total += 1;
      routeStats.by_backend[selectedBackend] = (routeStats.by_backend[selectedBackend] ?? 0) + 1;
      routeStats.by_source[source] = (routeStats.by_source[source] ?? 0) + 1;
      if (turn.guardReason) {
        routeStats.forced_native_guards[turn.guardReason] = (routeStats.forced_native_guards[turn.guardReason] ?? 0) + 1;
      }

      const sessionQueue = turnQueues.get(routeSessionId) ?? [];
      sessionQueue.push(turn);
      turnQueues.set(routeSessionId, sessionQueue);
      turns.push(turn);
      continue;
    }

    // Every other event only matters if its session still has a queued turn.
    const sessionId = readString(payload.session_id);
    if (!sessionId) {
      continue;
    }
    const queue = turnQueues.get(sessionId);
    if (!queue || queue.length === 0) {
      continue;
    }

    if (event.event_type === 'backend.success') {
      const turn = findPendingExternalTurn(queue, payload.backend);
      if (!turn) {
        continue;
      }
      turn.externalOutcome = 'success';
      turn.externalDurationMs = readNumber(payload.duration_ms);
      continue;
    }

    if (event.event_type === 'backend.fallback') {
      const turn = findPendingExternalTurn(queue, payload.from_backend);
      if (!turn) {
        continue;
      }
      const reason = readString(payload.reason) ?? 'unknown';
      turn.externalOutcome = 'fallback';
      turn.externalDurationMs = readNumber(payload.duration_ms);
      turn.fallbackReason = reason;

      // NOTE(review): the taxonomy counts fallbacks from every external backend
      // present in the filtered routes, not only `options.targetBackend` --
      // confirm that is intended, since attempt stats are target-only.
      bumpCount(fallbackCategoryCounts, normalizeFallbackCategory(reason));
      bumpCount(fallbackRawReasonCounts, normalizeFallbackReasonForDisplay(reason));
      continue;
    }

    if (event.event_type === 'session.message') {
      if (readString(payload.role) !== 'assistant') {
        continue;
      }
      // The assistant reply closes the oldest open turn of the session.
      const turn = queue[0];
      if (!turn) {
        continue;
      }
      turn.assistantTimestamp = event.timestamp;
      queue.shift();
    }
  }

  const targetTurns = turns.filter((turn) => turn.selectedBackend === options.targetBackend);
  const baselineTurns = turns.filter((turn) => turn.selectedBackend === options.baselineBackend);

  const targetStats = buildBackendStats(options.targetBackend, targetTurns);
  const baselineStats = buildBackendStats(options.baselineBackend, baselineTurns);

  const comparison: BackendComparisonStats = {
    completion_rate_delta_pp: (
      targetStats.completion_rate_pct !== null && baselineStats.completion_rate_pct !== null
    )
      ? Math.round((targetStats.completion_rate_pct - baselineStats.completion_rate_pct) * 100) / 100
      : null,
    p50_latency_delta_ms: latencyDelta(targetStats.e2e_latency_ms, baselineStats.e2e_latency_ms, 'p50_ms'),
    p95_latency_delta_ms: latencyDelta(targetStats.e2e_latency_ms, baselineStats.e2e_latency_ms, 'p95_ms'),
  };

  // External attempt stats are only reported when the target is not `native`.
  const targetExternalAttempts = options.targetBackend === 'native'
    ? null
    : buildAttemptStats(targetTurns);

  return {
    route_stats: routeStats,
    target: targetStats,
    baseline: baselineStats,
    target_external_attempts: targetExternalAttempts,
    comparison,
    fallback_categories: sortFallbackReasonStats(fallbackCategoryCounts),
    fallback_top_reasons: sortTopRawReasons(fallbackRawReasonCounts),
  };
}
/**
 * Checks a canary summary against the configured gate thresholds.
 *
 * Only thresholds supplied as numbers produce a criterion. A criterion whose
 * underlying metric is unavailable (`null`) is reported as failed. The gate
 * passes when every produced criterion passes, which includes the case where
 * no threshold was configured at all.
 *
 * @param summary Summary whose comparison deltas and target attempt stats are checked.
 * @param thresholds Optional upper bounds; absent entries are skipped.
 * @returns The overall verdict plus one entry per evaluated criterion, in the
 *   order completion rate, P50 latency, P95 latency, fallback rate.
 */
export function evaluateBackendCanaryGate(
  summary: BackendCanarySummary,
  thresholds: BackendCanaryGateThresholds,
): BackendCanaryGateResult {
  const checks: BackendCanaryGateResult['criteria'] = [];

  // Shared shape for the P50 / P95 criteria; the delta is read lazily so it is
  // only touched when the matching threshold is configured.
  const checkLatency = (criterion: string, limit: number | undefined, readDelta: () => number | null): void => {
    if (typeof limit !== 'number') {
      return;
    }
    const observed = readDelta();
    checks.push({
      criterion,
      pass: observed !== null && observed <= limit,
      actual: formatMs(observed),
      threshold: `<= ${limit}ms`,
    });
  };

  const maxDropPp = thresholds.maxCompletionRateDropPp;
  if (typeof maxDropPp === 'number') {
    const observed = summary.comparison.completion_rate_delta_pp;
    checks.push({
      criterion: 'Completion rate delta (target - baseline)',
      // A drop is a negative delta, so the allowed floor is the negated threshold.
      pass: observed !== null && observed >= (-maxDropPp),
      actual: observed === null ? 'n/a' : `${observed.toFixed(2)}pp`,
      threshold: `>= -${maxDropPp.toFixed(2)}pp`,
    });
  }

  checkLatency(
    'P50 latency delta (target - baseline)',
    thresholds.maxP50LatencyIncreaseMs,
    () => summary.comparison.p50_latency_delta_ms,
  );
  checkLatency(
    'P95 latency delta (target - baseline)',
    thresholds.maxP95LatencyIncreaseMs,
    () => summary.comparison.p95_latency_delta_ms,
  );

  const maxFallbackPct = thresholds.maxFallbackRatePct;
  if (typeof maxFallbackPct === 'number') {
    const attemptStats = summary.target_external_attempts;
    const observed = attemptStats
      ? toPct(attemptStats.fallbacks, attemptStats.attempts)
      : null;
    checks.push({
      criterion: 'Fallback rate (target external attempts)',
      pass: observed !== null && observed <= maxFallbackPct,
      actual: formatPct(observed),
      threshold: `<= ${maxFallbackPct.toFixed(2)}%`,
    });
  }

  return {
    // `every` on an empty list is true, so an unconfigured gate passes.
    pass: checks.every((check) => check.pass),
    criteria: checks,
  };
}
'n/a' : `${summary.comparison.completion_rate_delta_pp.toFixed(2)}pp`} |`); + if (summary.target_external_attempts) { + lines.push(`| External success rate | ${formatPct(summary.target_external_attempts.success_rate_pct)} | n/a | n/a |`); + lines.push(`| External attempts | ${summary.target_external_attempts.attempts} | n/a | n/a |`); + lines.push(`| External fallbacks | ${summary.target_external_attempts.fallbacks} | n/a | n/a |`); + } + lines.push(''); + + lines.push('## Latency'); + lines.push(''); + lines.push(renderLatencyInline('Target end-to-end', summary.target.e2e_latency_ms)); + lines.push(renderLatencyInline('Baseline end-to-end', summary.baseline.e2e_latency_ms)); + lines.push(`- P50 delta (target - baseline): ${formatMs(summary.comparison.p50_latency_delta_ms)}`); + lines.push(`- P95 delta (target - baseline): ${formatMs(summary.comparison.p95_latency_delta_ms)}`); + if (summary.target_external_attempts) { + lines.push(renderLatencyInline('Target external attempt', summary.target_external_attempts.attempt_latency_ms)); + } + lines.push(''); + + lines.push('## Fallback Taxonomy'); + lines.push(''); + lines.push('| Category | Count | Percent |'); + lines.push('| --- | ---: | ---: |'); + if (summary.fallback_categories.length === 0) { + lines.push('| _none_ | 0 | 0.00% |'); + } else { + for (const item of summary.fallback_categories) { + lines.push(`| ${item.category} | ${item.count} | ${item.pct.toFixed(2)}% |`); + } + } + lines.push(''); + + lines.push('## Top Fallback Reasons'); + lines.push(''); + if (summary.fallback_top_reasons.length === 0) { + lines.push('- none'); + } else { + for (const item of summary.fallback_top_reasons) { + lines.push(`- ${item.reason} (${item.count})`); + } + } + lines.push(''); + + if (gate) { + lines.push('## Gate Evaluation'); + lines.push(''); + lines.push(`- Gate result: ${gate.pass ? 'PASS' : 'HOLD'}`); + for (const criterion of gate.criteria) { + lines.push(`- [${criterion.pass ? 
'x' : ' '}] ${criterion.criterion}: actual=${criterion.actual}, threshold=${criterion.threshold}`); + } + lines.push(''); + } + + return lines.join('\n'); +}