feat(audit): add pi canary summary analyzer and cli script
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { AuditEvent } from './types.js';
|
||||
import {
|
||||
evaluateBackendCanaryGate,
|
||||
renderBackendCanaryMarkdown,
|
||||
summarizeBackendCanary,
|
||||
} from './backendCanarySummary.js';
|
||||
|
||||
/**
 * Test helper that assembles an `AuditEvent` fixture.
 *
 * The `level` field is always `'info'`; every other field is taken verbatim
 * from the arguments.
 *
 * @param timestamp - Value stored in the fixture's `timestamp` field.
 * @param event_type - Event type tag stored in `event_type`.
 * @param event - Payload object stored in `event`.
 * @returns The assembled fixture.
 */
function makeEvent(
  timestamp: number,
  event_type: AuditEvent['event_type'],
  event: Record<string, unknown>,
): AuditEvent {
  const fixture: AuditEvent = { timestamp, level: 'info', event_type, event };
  return fixture;
}
|
||||
|
||||
describe('summarizeBackendCanary', () => {
|
||||
it('computes route, reliability, latency, and fallback summaries', () => {
|
||||
const events: AuditEvent[] = [
|
||||
makeEvent(1000, 'backend.route', {
|
||||
session_id: 'telegram:canary',
|
||||
channel: 'telegram',
|
||||
sender: '8367012007',
|
||||
selected_backend: 'pi_embedded',
|
||||
source: 'agent_override',
|
||||
}),
|
||||
makeEvent(1120, 'backend.success', {
|
||||
session_id: 'telegram:canary',
|
||||
channel: 'telegram',
|
||||
sender: '8367012007',
|
||||
backend: 'pi_embedded',
|
||||
duration_ms: 120,
|
||||
response_length: 50,
|
||||
}),
|
||||
makeEvent(1140, 'session.message', {
|
||||
session_id: 'telegram:canary',
|
||||
role: 'assistant',
|
||||
content_length: 50,
|
||||
}),
|
||||
makeEvent(2000, 'backend.route', {
|
||||
session_id: 'telegram:canary',
|
||||
channel: 'telegram',
|
||||
sender: '8367012007',
|
||||
selected_backend: 'pi_embedded',
|
||||
source: 'agent_override',
|
||||
}),
|
||||
makeEvent(2300, 'backend.fallback', {
|
||||
session_id: 'telegram:canary',
|
||||
channel: 'telegram',
|
||||
sender: '8367012007',
|
||||
from_backend: 'pi_embedded',
|
||||
to_backend: 'native',
|
||||
reason: 'request timed out waiting for backend process',
|
||||
duration_ms: 300,
|
||||
}),
|
||||
makeEvent(2340, 'session.message', {
|
||||
session_id: 'telegram:canary',
|
||||
role: 'assistant',
|
||||
content_length: 80,
|
||||
}),
|
||||
makeEvent(3000, 'backend.route', {
|
||||
session_id: 'telegram:control',
|
||||
channel: 'telegram',
|
||||
sender: '123',
|
||||
selected_backend: 'native',
|
||||
source: 'native',
|
||||
}),
|
||||
makeEvent(3080, 'session.message', {
|
||||
session_id: 'telegram:control',
|
||||
role: 'assistant',
|
||||
content_length: 25,
|
||||
}),
|
||||
];
|
||||
|
||||
const summary = summarizeBackendCanary(events, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
});
|
||||
|
||||
expect(summary.route_stats.total).toBe(3);
|
||||
expect(summary.route_stats.by_backend.pi_embedded).toBe(2);
|
||||
expect(summary.route_stats.by_backend.native).toBe(1);
|
||||
|
||||
expect(summary.target.routes).toBe(2);
|
||||
expect(summary.target.completed_turns).toBe(2);
|
||||
expect(summary.target.completion_rate_pct).toBe(100);
|
||||
expect(summary.target.e2e_latency_ms?.p50_ms).toBe(240);
|
||||
expect(summary.target.e2e_latency_ms?.p95_ms).toBe(330);
|
||||
|
||||
expect(summary.baseline.routes).toBe(1);
|
||||
expect(summary.baseline.completion_rate_pct).toBe(100);
|
||||
expect(summary.baseline.e2e_latency_ms?.p50_ms).toBe(80);
|
||||
|
||||
expect(summary.target_external_attempts?.attempts).toBe(2);
|
||||
expect(summary.target_external_attempts?.successes).toBe(1);
|
||||
expect(summary.target_external_attempts?.fallbacks).toBe(1);
|
||||
expect(summary.target_external_attempts?.success_rate_pct).toBe(50);
|
||||
expect(summary.target_external_attempts?.attempt_latency_ms?.p50_ms).toBe(210);
|
||||
|
||||
expect(summary.comparison.p50_latency_delta_ms).toBe(160);
|
||||
expect(summary.comparison.p95_latency_delta_ms).toBe(250);
|
||||
|
||||
expect(summary.fallback_categories).toEqual([
|
||||
{ category: 'timeout', count: 1, pct: 100 },
|
||||
]);
|
||||
expect(summary.fallback_top_reasons[0]?.reason).toContain('request timed out');
|
||||
});
|
||||
|
||||
it('filters routes by session id', () => {
|
||||
const events: AuditEvent[] = [
|
||||
makeEvent(1000, 'backend.route', {
|
||||
session_id: 'telegram:canary',
|
||||
channel: 'telegram',
|
||||
sender: '8367012007',
|
||||
selected_backend: 'pi_embedded',
|
||||
source: 'agent_override',
|
||||
}),
|
||||
makeEvent(1100, 'session.message', {
|
||||
session_id: 'telegram:canary',
|
||||
role: 'assistant',
|
||||
content_length: 10,
|
||||
}),
|
||||
makeEvent(2000, 'backend.route', {
|
||||
session_id: 'telegram:other',
|
||||
channel: 'telegram',
|
||||
sender: '9999',
|
||||
selected_backend: 'native',
|
||||
source: 'native',
|
||||
}),
|
||||
makeEvent(2100, 'session.message', {
|
||||
session_id: 'telegram:other',
|
||||
role: 'assistant',
|
||||
content_length: 10,
|
||||
}),
|
||||
];
|
||||
|
||||
const summary = summarizeBackendCanary(events, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
sessionIds: ['telegram:canary'],
|
||||
});
|
||||
|
||||
expect(summary.route_stats.total).toBe(1);
|
||||
expect(summary.target.routes).toBe(1);
|
||||
expect(summary.baseline.routes).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('evaluateBackendCanaryGate', () => {
|
||||
it('evaluates configured pass/fail thresholds', () => {
|
||||
const events: AuditEvent[] = [
|
||||
makeEvent(1000, 'backend.route', {
|
||||
session_id: 's1',
|
||||
channel: 'telegram',
|
||||
sender: '1',
|
||||
selected_backend: 'pi_embedded',
|
||||
source: 'agent_override',
|
||||
}),
|
||||
makeEvent(1200, 'backend.success', {
|
||||
session_id: 's1',
|
||||
channel: 'telegram',
|
||||
sender: '1',
|
||||
backend: 'pi_embedded',
|
||||
duration_ms: 200,
|
||||
response_length: 10,
|
||||
}),
|
||||
makeEvent(1250, 'session.message', {
|
||||
session_id: 's1',
|
||||
role: 'assistant',
|
||||
content_length: 20,
|
||||
}),
|
||||
makeEvent(2000, 'backend.route', {
|
||||
session_id: 's2',
|
||||
channel: 'telegram',
|
||||
sender: '2',
|
||||
selected_backend: 'native',
|
||||
source: 'native',
|
||||
}),
|
||||
makeEvent(2050, 'session.message', {
|
||||
session_id: 's2',
|
||||
role: 'assistant',
|
||||
content_length: 20,
|
||||
}),
|
||||
];
|
||||
|
||||
const summary = summarizeBackendCanary(events, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
});
|
||||
|
||||
const gate = evaluateBackendCanaryGate(summary, {
|
||||
maxCompletionRateDropPp: 0,
|
||||
maxP50LatencyIncreaseMs: 300,
|
||||
maxP95LatencyIncreaseMs: 300,
|
||||
maxFallbackRatePct: 5,
|
||||
});
|
||||
|
||||
expect(gate.pass).toBe(true);
|
||||
expect(gate.criteria).toHaveLength(4);
|
||||
|
||||
const markdown = renderBackendCanaryMarkdown(summary, {
|
||||
targetBackend: 'pi_embedded',
|
||||
baselineBackend: 'native',
|
||||
}, gate);
|
||||
expect(markdown).toContain('Pi Embedded Canary Summary');
|
||||
expect(markdown).toContain('Gate result: PASS');
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,618 @@
|
||||
import type { ExternalBackendName } from '../backends/external.js';
|
||||
import type { AuditEvent } from './types.js';
|
||||
|
||||
export type RoutedBackendName = 'native' | ExternalBackendName;
|
||||
|
||||
export type BackendRouteSource = 'agent_override' | 'default_external' | 'native' | 'forced_native_guard';
|
||||
|
||||
export type BackendGuardReason = 'capability_query' | 'pi_no_tools_mode' | 'attachments_present';
|
||||
|
||||
export interface BackendCanarySummaryOptions {
|
||||
targetBackend: RoutedBackendName;
|
||||
baselineBackend: RoutedBackendName;
|
||||
sessionIds?: string[];
|
||||
channels?: string[];
|
||||
senders?: string[];
|
||||
routeSources?: BackendRouteSource[];
|
||||
}
|
||||
|
||||
export interface LatencyStats {
|
||||
count: number;
|
||||
avg_ms: number;
|
||||
p50_ms: number;
|
||||
p95_ms: number;
|
||||
min_ms: number;
|
||||
max_ms: number;
|
||||
}
|
||||
|
||||
export interface RouteStats {
|
||||
total: number;
|
||||
by_backend: Partial<Record<RoutedBackendName, number>>;
|
||||
by_source: Partial<Record<BackendRouteSource, number>>;
|
||||
forced_native_guards: Partial<Record<BackendGuardReason, number>>;
|
||||
}
|
||||
|
||||
export interface BackendStats {
|
||||
backend: RoutedBackendName;
|
||||
routes: number;
|
||||
completed_turns: number;
|
||||
incomplete_turns: number;
|
||||
completion_rate_pct: number | null;
|
||||
e2e_latency_ms: LatencyStats | null;
|
||||
}
|
||||
|
||||
export interface ExternalBackendAttemptStats {
|
||||
attempts: number;
|
||||
successes: number;
|
||||
fallbacks: number;
|
||||
unresolved_attempts: number;
|
||||
success_rate_pct: number | null;
|
||||
attempt_latency_ms: LatencyStats | null;
|
||||
}
|
||||
|
||||
export interface FallbackReasonStats {
|
||||
category: string;
|
||||
count: number;
|
||||
pct: number;
|
||||
}
|
||||
|
||||
export interface FallbackRawReasonStats {
|
||||
reason: string;
|
||||
count: number;
|
||||
}
|
||||
|
||||
export interface BackendComparisonStats {
|
||||
completion_rate_delta_pp: number | null;
|
||||
p50_latency_delta_ms: number | null;
|
||||
p95_latency_delta_ms: number | null;
|
||||
}
|
||||
|
||||
export interface BackendCanarySummary {
|
||||
route_stats: RouteStats;
|
||||
target: BackendStats;
|
||||
baseline: BackendStats;
|
||||
target_external_attempts: ExternalBackendAttemptStats | null;
|
||||
comparison: BackendComparisonStats;
|
||||
fallback_categories: FallbackReasonStats[];
|
||||
fallback_top_reasons: FallbackRawReasonStats[];
|
||||
}
|
||||
|
||||
export interface BackendCanaryGateThresholds {
|
||||
maxCompletionRateDropPp?: number;
|
||||
maxP50LatencyIncreaseMs?: number;
|
||||
maxP95LatencyIncreaseMs?: number;
|
||||
maxFallbackRatePct?: number;
|
||||
}
|
||||
|
||||
export interface BackendCanaryGateResult {
|
||||
pass: boolean;
|
||||
criteria: Array<{ criterion: string; pass: boolean; actual: string; threshold: string }>;
|
||||
}
|
||||
|
||||
interface RouteTurn {
|
||||
sessionId: string;
|
||||
channel: string;
|
||||
sender: string;
|
||||
selectedBackend: RoutedBackendName;
|
||||
source: BackendRouteSource;
|
||||
guardReason?: BackendGuardReason;
|
||||
routeTimestamp: number;
|
||||
externalOutcome?: 'success' | 'fallback';
|
||||
externalDurationMs?: number;
|
||||
fallbackReason?: string;
|
||||
assistantTimestamp?: number;
|
||||
}
|
||||
|
||||
/**
 * Views an arbitrary value as a string-keyed record.
 *
 * Any non-null value whose `typeof` is `'object'` (arrays included) is
 * returned as-is; everything else yields a fresh empty object.
 *
 * @param value - Value to coerce.
 * @returns The same object reference, or `{}` when `value` is not an object.
 */
function toRecord(value: unknown): Record<string, unknown> {
  if (typeof value !== 'object' || value === null) {
    return {};
  }
  return value as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Returns `value` when it is a string primitive, otherwise `undefined`.
 *
 * @param value - Untyped payload field.
 * @returns The string, or `undefined` for any non-string input.
 */
function readString(value: unknown): string | undefined {
  if (typeof value === 'string') {
    return value;
  }
  return undefined;
}
|
||||
|
||||
/**
 * Returns `value` when it is a finite number, otherwise `undefined`.
 *
 * `NaN` and the infinities are rejected along with every non-number input.
 *
 * @param value - Untyped payload field.
 * @returns The finite number, or `undefined`.
 */
function readNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
|
||||
|
||||
/**
 * Type guard for `BackendRouteSource`.
 *
 * @param value - Untyped payload field.
 * @returns `true` only for the four route-source literals listed below.
 */
function isRouteSource(value: unknown): value is BackendRouteSource {
  switch (value) {
    case 'agent_override':
    case 'default_external':
    case 'native':
    case 'forced_native_guard':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Type guard for `BackendGuardReason`.
 *
 * @param value - Untyped payload field.
 * @returns `true` only for the three guard-reason literals listed below.
 */
function isGuardReason(value: unknown): value is BackendGuardReason {
  const knownReasons: readonly unknown[] = [
    'capability_query',
    'pi_no_tools_mode',
    'attachments_present',
  ];
  return knownReasons.includes(value);
}
|
||||
|
||||
/**
 * Type guard for `RoutedBackendName`.
 *
 * Accepts `'native'` plus the external backend names enumerated below.
 *
 * @param value - Untyped payload field.
 * @returns `true` only when `value` is one of the listed backend names.
 */
function isRoutedBackendName(value: unknown): value is RoutedBackendName {
  if (typeof value !== 'string') {
    return false;
  }
  switch (value) {
    case 'native':
    case 'claude_code':
    case 'opencode':
    case 'codex':
    case 'gemini':
    case 'pi_embedded':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Percentile of an already-sorted sample set, using linear interpolation
 * between the two nearest ranks.
 *
 * @param sortedAscending - Samples sorted in ascending order.
 * @param pct - Requested percentile; clamped into the range 0..100.
 * @returns The interpolated value; `0` for an empty input and the sole
 *   element for a single-sample input.
 */
function percentile(sortedAscending: number[], pct: number): number {
  const count = sortedAscending.length;
  if (count === 0) {
    return 0;
  }
  if (count === 1) {
    return sortedAscending[0];
  }

  const boundedPct = Math.min(100, Math.max(0, pct));
  // Fractional rank within [0, count - 1].
  const rank = (boundedPct / 100) * (count - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);

  const lowValue = sortedAscending[below] ?? 0;
  if (below === above) {
    // Rank landed exactly on a sample; no interpolation needed.
    return lowValue;
  }
  const highValue = sortedAscending[above] ?? 0;
  return lowValue + ((highValue - lowValue) * (rank - below));
}
|
||||
|
||||
/**
 * Summarizes a set of latency samples.
 *
 * The input array is not mutated; a sorted copy is used internally. Average
 * and percentile figures are rounded to whole numbers, while min and max are
 * reported as-is.
 *
 * @param samples - Latency samples (milliseconds, per the field names).
 * @returns Aggregate stats, or `null` when there are no samples.
 */
function computeLatencyStats(samples: number[]): LatencyStats | null {
  const sampleCount = samples.length;
  if (sampleCount === 0) {
    return null;
  }

  const ascending = Array.from(samples).sort((left, right) => left - right);

  let sum = 0;
  for (const sample of ascending) {
    sum += sample;
  }

  return {
    count: sampleCount,
    avg_ms: Math.round(sum / sampleCount),
    p50_ms: Math.round(percentile(ascending, 50)),
    p95_ms: Math.round(percentile(ascending, 95)),
    min_ms: ascending[0] ?? 0,
    max_ms: ascending[sampleCount - 1] ?? 0,
  };
}
|
||||
|
||||
/**
 * Expresses `part / whole` as a percentage rounded to two decimal places.
 *
 * @param part - Numerator.
 * @param whole - Denominator.
 * @returns The percentage, or `null` when `whole` is zero or negative.
 */
function toPct(part: number, whole: number): number | null {
  if (whole <= 0) {
    return null;
  }
  const ratio = part / whole;
  // Scale to hundredths of a percent, round, then scale back.
  return Math.round(ratio * 10000) / 100;
}
|
||||
|
||||
/**
 * Maps a free-form fallback reason onto a coarse category label.
 *
 * The reason is collapsed to a single line (every whitespace run, including
 * newlines, becomes one space), trimmed and lower-cased before matching, so
 * reasons that differ only in whitespace land in the same bucket and
 * multi-word needles such as "timed out" still match across a line break.
 * Rules are tried in order and the first rule with a matching substring wins.
 *
 * @param reason - Raw fallback reason text.
 * @returns `'unknown'` for blank input, one of the fixed category labels when
 *   a rule matches, otherwise the normalized single-line reason truncated to
 *   80 characters.
 */
function normalizeFallbackCategory(reason: string): string {
  const normalized = reason.replace(/\s+/g, ' ').trim().toLowerCase();
  if (!normalized) {
    return 'unknown';
  }

  // Ordered rule table: [category label, substrings that select it].
  const rules: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['timeout', ['timeout', 'timed out']],
    ['cancelled', ['abort', 'cancel']],
    ['rate_limit', ['rate limit', '429']],
    ['auth', ['unauthorized', 'forbidden', '401', '403', 'api key']],
    ['network', ['fetch failed', 'network', 'socket', 'econn', 'enotfound', 'connect']],
    ['tool_or_capability', ['tool', 'capability']],
    ['response_format', ['json', 'parse', 'format']],
  ];

  for (const [category, needles] of rules) {
    if (needles.some((needle) => normalized.includes(needle))) {
      return category;
    }
  }

  // No rule matched: use the reason itself as the category key. Truncation can
  // cut right after a space, so drop any trailing whitespace it leaves behind.
  return normalized.slice(0, 80).trimEnd();
}
|
||||
|
||||
/**
 * Produces a compact single-line form of a fallback reason for display.
 *
 * Whitespace runs are collapsed to single spaces, leading and trailing
 * whitespace is dropped, and the result is cut to 140 characters.
 *
 * @param reason - Raw fallback reason text.
 * @returns The compacted reason, or `'unknown'` when nothing but whitespace
 *   was supplied.
 */
function normalizeFallbackReasonForDisplay(reason: string): string {
  const words = reason.split(/\s+/).filter((word) => word.length > 0);
  const collapsed = words.join(' ');
  return collapsed.length === 0 ? 'unknown' : collapsed.slice(0, 140);
}
|
||||
|
||||
/**
 * Aggregates completion and end-to-end latency figures for one backend.
 *
 * A turn counts as completed when it carries a numeric `assistantTimestamp`.
 * Its latency is `assistantTimestamp - routeTimestamp`; negative differences
 * are left out of the latency samples but the turn still counts as completed.
 *
 * @param backend - Backend name echoed into the result.
 * @param turns - Turns routed to that backend.
 * @returns Route/completion counts, completion rate and latency stats.
 */
function buildBackendStats(backend: RoutedBackendName, turns: RouteTurn[]): BackendStats {
  let completedCount = 0;
  const latencies: number[] = [];

  for (const turn of turns) {
    const finishedAt = turn.assistantTimestamp;
    if (typeof finishedAt !== 'number') {
      continue;
    }
    completedCount += 1;
    const elapsed = finishedAt - turn.routeTimestamp;
    if (elapsed >= 0) {
      latencies.push(elapsed);
    }
  }

  const routeCount = turns.length;
  return {
    backend,
    routes: routeCount,
    completed_turns: completedCount,
    incomplete_turns: routeCount - completedCount,
    completion_rate_pct: toPct(completedCount, routeCount),
    e2e_latency_ms: computeLatencyStats(latencies),
  };
}
|
||||
|
||||
/**
 * Aggregates external-backend attempt outcomes for a set of turns.
 *
 * A turn is an attempt once `externalOutcome` is set. Turns with no outcome
 * are reported as unresolved. Only non-negative numeric durations of
 * attempts feed the latency stats.
 *
 * @param turns - Turns routed to the backend under inspection.
 * @returns Attempt, success, fallback and unresolved counts, the success
 *   rate over attempts, and attempt latency stats.
 */
function buildAttemptStats(turns: RouteTurn[]): ExternalBackendAttemptStats {
  let attemptCount = 0;
  let successCount = 0;
  let fallbackCount = 0;
  const durations: number[] = [];

  for (const turn of turns) {
    const outcome = turn.externalOutcome;
    if (outcome === undefined) {
      continue;
    }
    attemptCount += 1;
    if (outcome === 'success') {
      successCount += 1;
    } else if (outcome === 'fallback') {
      fallbackCount += 1;
    }

    const duration = turn.externalDurationMs;
    if (typeof duration === 'number' && duration >= 0) {
      durations.push(duration);
    }
  }

  return {
    attempts: attemptCount,
    successes: successCount,
    fallbacks: fallbackCount,
    unresolved_attempts: turns.length - attemptCount,
    success_rate_pct: toPct(successCount, attemptCount),
    attempt_latency_ms: computeLatencyStats(durations),
  };
}
|
||||
|
||||
/**
 * Converts category counts into a ranked list with percentage shares.
 *
 * Entries are ordered by descending count; ties are ordered by category name
 * via `localeCompare`. Each share is a percentage of the summed counts,
 * rounded to two decimals.
 *
 * @param categoryCounts - Count per fallback category.
 * @returns Ranked category rows.
 */
function sortFallbackReasonStats(categoryCounts: Map<string, number>): FallbackReasonStats[] {
  let grandTotal = 0;
  for (const count of categoryCounts.values()) {
    grandTotal += count;
  }

  const ranked = Array.from(categoryCounts.entries());
  ranked.sort(([nameA, countA], [nameB, countB]) => {
    const byCount = countB - countA;
    if (byCount) {
      return byCount;
    }
    return nameA.localeCompare(nameB);
  });

  return ranked.map(([category, count]) => {
    const pct = grandTotal > 0 ? Math.round((count / grandTotal) * 10000) / 100 : 0;
    return { category, count, pct };
  });
}
|
||||
|
||||
/**
 * Returns the most frequent raw fallback reasons.
 *
 * Entries are ordered by descending count, with ties ordered by reason text
 * via `localeCompare`, and the list is cut to `limit` entries.
 *
 * @param rawReasonCounts - Count per display-normalized reason.
 * @param limit - Maximum number of entries to return (default 10).
 * @returns The top reasons with their counts.
 */
function sortTopRawReasons(rawReasonCounts: Map<string, number>, limit = 10): FallbackRawReasonStats[] {
  const ranked = Array.from(rawReasonCounts, ([reason, count]) => ({ reason, count }));
  ranked.sort((left, right) => (right.count - left.count) || left.reason.localeCompare(right.reason));
  return ranked.slice(0, limit);
}
|
||||
|
||||
/**
 * Builds a canary summary from audit events.
 *
 * Events are processed in ascending `timestamp` order (the input array is not
 * mutated). Each accepted `backend.route` event opens a turn that is queued
 * per session:
 *
 * - `backend.success` / `backend.fallback` events resolve the first queued
 *   turn of the same session whose selected backend matches and which has no
 *   external outcome yet. Events naming `native` or an unknown backend are
 *   ignored.
 * - A `session.message` event with role `assistant` completes the oldest
 *   queued turn of its session and removes it from the queue.
 *
 * Route events with missing or invalid fields are skipped, as are routes
 * rejected by a filter in `options`; an empty or omitted filter list places
 * no restriction on that dimension. Non-route events for a session with no
 * queued turn are ignored.
 *
 * Note that the fallback category and raw-reason tallies count every fallback
 * that was matched to a turn, whichever external backend it came from; they
 * are not restricted to `options.targetBackend`.
 *
 * @param events - Audit events to analyze.
 * @param options - Target and baseline backends plus optional route filters.
 * @returns Route distribution, per-backend stats for target and baseline,
 *   target external-attempt stats (`null` when the target is `native`),
 *   target-minus-baseline deltas, and fallback breakdowns.
 */
export function summarizeBackendCanary(events: AuditEvent[], options: BackendCanarySummaryOptions): BackendCanarySummary {
  const allowedSessions = new Set(options.sessionIds ?? []);
  const allowedChannels = new Set(options.channels ?? []);
  const allowedSenders = new Set(options.senders ?? []);
  const allowedSources = new Set(options.routeSources ?? []);

  // An empty allow-list means "no restriction" for that dimension.
  const permits = <T>(allowList: Set<T>, candidate: T): boolean => (
    allowList.size === 0 || allowList.has(candidate)
  );
  const bump = (tally: Map<string, number>, key: string): void => {
    tally.set(key, (tally.get(key) ?? 0) + 1);
  };
  // First queued turn for `backend` that has not yet seen a success/fallback.
  const firstUnresolvedAttempt = (pending: RouteTurn[], backend: RoutedBackendName): RouteTurn | undefined => (
    pending.find((candidate) => candidate.selectedBackend === backend && candidate.externalOutcome === undefined)
  );

  const routeStats: RouteStats = {
    total: 0,
    by_backend: {},
    by_source: {},
    forced_native_guards: {},
  };
  const pendingBySession = new Map<string, RouteTurn[]>();
  const allTurns: RouteTurn[] = [];
  const categoryTally = new Map<string, number>();
  const rawReasonTally = new Map<string, number>();

  const chronological = Array.from(events).sort((earlier, later) => earlier.timestamp - later.timestamp);

  for (const auditEvent of chronological) {
    const payload = toRecord(auditEvent.event);
    const sessionId = readString(payload.session_id);

    if (auditEvent.event_type === 'backend.route') {
      const channel = readString(payload.channel);
      const sender = readString(payload.sender);
      const selectedBackend = payload.selected_backend;
      const source = payload.source;
      const rawGuardReason = payload.guard_reason;

      if (!sessionId || !channel || !sender || !isRoutedBackendName(selectedBackend) || !isRouteSource(source)) {
        continue;
      }
      const accepted = permits(allowedSessions, sessionId)
        && permits(allowedChannels, channel)
        && permits(allowedSenders, sender)
        && permits(allowedSources, source);
      if (!accepted) {
        continue;
      }

      const turn: RouteTurn = {
        sessionId,
        channel,
        sender,
        selectedBackend,
        source,
        guardReason: isGuardReason(rawGuardReason) ? rawGuardReason : undefined,
        routeTimestamp: auditEvent.timestamp,
      };

      routeStats.total += 1;
      routeStats.by_backend[selectedBackend] = (routeStats.by_backend[selectedBackend] ?? 0) + 1;
      routeStats.by_source[source] = (routeStats.by_source[source] ?? 0) + 1;
      if (turn.guardReason) {
        routeStats.forced_native_guards[turn.guardReason] = (routeStats.forced_native_guards[turn.guardReason] ?? 0) + 1;
      }

      const existingQueue = pendingBySession.get(sessionId);
      if (existingQueue) {
        existingQueue.push(turn);
      } else {
        pendingBySession.set(sessionId, [turn]);
      }
      allTurns.push(turn);
      continue;
    }

    // Every other event type only matters when its session has a queued turn.
    if (!sessionId) {
      continue;
    }
    const pending = pendingBySession.get(sessionId);
    if (!pending || pending.length === 0) {
      continue;
    }

    switch (auditEvent.event_type) {
      case 'backend.success': {
        const backend = payload.backend;
        if (!isRoutedBackendName(backend) || backend === 'native') {
          break;
        }
        const turn = firstUnresolvedAttempt(pending, backend);
        if (turn) {
          turn.externalOutcome = 'success';
          turn.externalDurationMs = readNumber(payload.duration_ms);
        }
        break;
      }

      case 'backend.fallback': {
        const fromBackend = payload.from_backend;
        if (!isRoutedBackendName(fromBackend) || fromBackend === 'native') {
          break;
        }
        const turn = firstUnresolvedAttempt(pending, fromBackend);
        if (!turn) {
          break;
        }
        const reason = readString(payload.reason) ?? 'unknown';
        turn.externalOutcome = 'fallback';
        turn.externalDurationMs = readNumber(payload.duration_ms);
        turn.fallbackReason = reason;

        bump(categoryTally, normalizeFallbackCategory(reason));
        bump(rawReasonTally, normalizeFallbackReasonForDisplay(reason));
        break;
      }

      case 'session.message': {
        if (readString(payload.role) !== 'assistant') {
          break;
        }
        // The oldest queued turn is the one this assistant reply completes.
        const turn = pending.shift();
        if (turn) {
          turn.assistantTimestamp = auditEvent.timestamp;
        }
        break;
      }

      default:
        break;
    }
  }

  const targetTurns = allTurns.filter((turn) => turn.selectedBackend === options.targetBackend);
  const baselineTurns = allTurns.filter((turn) => turn.selectedBackend === options.baselineBackend);

  const targetStats = buildBackendStats(options.targetBackend, targetTurns);
  const baselineStats = buildBackendStats(options.baselineBackend, baselineTurns);

  const targetRate = targetStats.completion_rate_pct;
  const baselineRate = baselineStats.completion_rate_pct;
  const targetLatency = targetStats.e2e_latency_ms;
  const baselineLatency = baselineStats.e2e_latency_ms;

  // Latency deltas are only defined when both sides have latency samples.
  const latencyDelta = (pick: (stats: LatencyStats) => number): number | null => (
    targetLatency && baselineLatency ? pick(targetLatency) - pick(baselineLatency) : null
  );

  const comparison: BackendComparisonStats = {
    completion_rate_delta_pp: targetRate !== null && baselineRate !== null
      ? Math.round((targetRate - baselineRate) * 100) / 100
      : null,
    p50_latency_delta_ms: latencyDelta((stats) => stats.p50_ms),
    p95_latency_delta_ms: latencyDelta((stats) => stats.p95_ms),
  };

  // Attempt stats are only produced for a non-native target.
  const targetExternalAttempts = options.targetBackend === 'native'
    ? null
    : buildAttemptStats(targetTurns);

  return {
    route_stats: routeStats,
    target: targetStats,
    baseline: baselineStats,
    target_external_attempts: targetExternalAttempts,
    comparison,
    fallback_categories: sortFallbackReasonStats(categoryTally),
    fallback_top_reasons: sortTopRawReasons(rawReasonTally),
  };
}
|
||||
|
||||
/**
 * Formats a percentage with two decimals and a `%` suffix.
 *
 * @param value - Percentage value, or `null` when unavailable.
 * @returns The formatted text, or `'n/a'` for `null`.
 */
function formatPct(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return `${value.toFixed(2)}%`;
}
|
||||
|
||||
/**
 * Formats a millisecond quantity with an `ms` suffix.
 *
 * @param value - Millisecond value, or `null` when unavailable.
 * @returns The formatted text, or `'n/a'` for `null`.
 */
function formatMs(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return `${value}ms`;
}
|
||||
|
||||
/**
 * Evaluates a canary summary against optional gate thresholds.
 *
 * Only thresholds supplied as numbers produce a criterion, always in this
 * order: completion-rate delta, P50 latency delta, P95 latency delta,
 * fallback rate. A criterion whose measured value is unavailable (`null`)
 * fails. The fallback rate is fallbacks over external attempts for the
 * target and is unavailable when the summary has no target attempt stats or
 * zero attempts.
 *
 * @param summary - Summary to evaluate.
 * @param thresholds - Limits to enforce; omitted limits are not checked.
 * @returns The per-criterion results and an overall `pass`, which is `true`
 *   when every evaluated criterion passes (including when none was evaluated).
 */
export function evaluateBackendCanaryGate(
  summary: BackendCanarySummary,
  thresholds: BackendCanaryGateThresholds,
): BackendCanaryGateResult {
  const criteria: BackendCanaryGateResult['criteria'] = [];
  const { comparison, target_external_attempts: externalAttempts } = summary;
  const {
    maxCompletionRateDropPp: maxDropPp,
    maxP50LatencyIncreaseMs: maxP50Ms,
    maxP95LatencyIncreaseMs: maxP95Ms,
    maxFallbackRatePct: maxFallbackPct,
  } = thresholds;

  // Shared shape for the two latency criteria: delta must not exceed the limit.
  const addLatencyCriterion = (criterion: string, observed: number | null, limitMs: number): void => {
    criteria.push({
      criterion,
      pass: observed !== null && observed <= limitMs,
      actual: formatMs(observed),
      threshold: `<= ${limitMs}ms`,
    });
  };

  if (typeof maxDropPp === 'number') {
    const observed = comparison.completion_rate_delta_pp;
    criteria.push({
      criterion: 'Completion rate delta (target - baseline)',
      // A drop shows up as a negative delta, hence the negated limit.
      pass: observed !== null && observed >= -maxDropPp,
      actual: observed === null ? 'n/a' : `${observed.toFixed(2)}pp`,
      threshold: `>= -${maxDropPp.toFixed(2)}pp`,
    });
  }

  if (typeof maxP50Ms === 'number') {
    addLatencyCriterion('P50 latency delta (target - baseline)', comparison.p50_latency_delta_ms, maxP50Ms);
  }

  if (typeof maxP95Ms === 'number') {
    addLatencyCriterion('P95 latency delta (target - baseline)', comparison.p95_latency_delta_ms, maxP95Ms);
  }

  if (typeof maxFallbackPct === 'number') {
    const observedRate = externalAttempts
      ? toPct(externalAttempts.fallbacks, externalAttempts.attempts)
      : null;
    criteria.push({
      criterion: 'Fallback rate (target external attempts)',
      pass: observedRate !== null && observedRate <= maxFallbackPct,
      actual: formatPct(observedRate),
      threshold: `<= ${maxFallbackPct.toFixed(2)}%`,
    });
  }

  // `every` on an empty list is true, so a gate with no criteria passes.
  return {
    pass: criteria.every((entry) => entry.pass),
    criteria,
  };
}
|
||||
|
||||
/**
 * Renders latency stats as a single Markdown bullet.
 *
 * @param label - Text placed before the colon.
 * @param stats - Stats to render, or `null` when unavailable.
 * @returns A `- label: …` line listing count, avg, p50, p95, min and max, or
 *   `- label: n/a` when `stats` is `null`.
 */
function renderLatencyInline(label: string, stats: LatencyStats | null): string {
  if (stats === null || stats === undefined) {
    return `- ${label}: n/a`;
  }
  const { count, avg_ms, p50_ms, p95_ms, min_ms, max_ms } = stats;
  return `- ${label}: count=${count}, avg=${avg_ms}ms, p50=${p50_ms}ms, p95=${p95_ms}ms, min=${min_ms}ms, max=${max_ms}ms`;
}
|
||||
|
||||
/**
 * Renders a canary summary as a Markdown report.
 *
 * Sections, in order: header, route distribution, reliability, latency,
 * fallback taxonomy, top fallback reasons and, when `gate` is supplied, the
 * gate evaluation. The gate result prints as `PASS` or `HOLD`.
 *
 * Text placed in table cells is collapsed to a single line and has `|`
 * escaped as `\|`. Fallback categories can be raw reason text, and an
 * unescaped pipe or newline there would break the table row.
 *
 * @param summary - Summary to render.
 * @param options - Supplies the target and baseline backend names shown in
 *   the header.
 * @param gate - Optional gate result to append.
 * @returns The report as one string with lines joined by `\n`.
 */
export function renderBackendCanaryMarkdown(
  summary: BackendCanarySummary,
  options: BackendCanarySummaryOptions,
  gate?: BackendCanaryGateResult,
): string {
  // Keeps arbitrary text from terminating a table cell or row early.
  const tableCell = (text: string): string => text.replace(/\s+/g, ' ').replace(/\|/g, '\\|');

  const lines: string[] = [];

  lines.push('# Pi Embedded Canary Summary');
  lines.push('');
  lines.push(`- Target backend: \`${options.targetBackend}\``);
  lines.push(`- Baseline backend: \`${options.baselineBackend}\``);
  lines.push(`- Routes analyzed: ${summary.route_stats.total}`);
  lines.push('');

  lines.push('## Route Distribution');
  lines.push('');
  lines.push('| Backend | Routes |');
  lines.push('| --- | ---: |');
  // Highest route count first; ties ordered by backend name.
  const backendRows = Object.entries(summary.route_stats.by_backend)
    .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
  for (const [backend, count] of backendRows) {
    lines.push(`| ${tableCell(backend)} | ${count} |`);
  }
  if (backendRows.length === 0) {
    lines.push('| _none_ | 0 |');
  }
  lines.push('');

  lines.push('## Reliability');
  lines.push('');
  lines.push('| Metric | Target | Baseline | Delta |');
  lines.push('| --- | ---: | ---: | ---: |');
  lines.push(`| Turn completion rate | ${formatPct(summary.target.completion_rate_pct)} | ${formatPct(summary.baseline.completion_rate_pct)} | ${summary.comparison.completion_rate_delta_pp === null ? 'n/a' : `${summary.comparison.completion_rate_delta_pp.toFixed(2)}pp`} |`);
  if (summary.target_external_attempts) {
    // External attempt figures exist only for the target, so the baseline
    // and delta columns are fixed to n/a.
    lines.push(`| External success rate | ${formatPct(summary.target_external_attempts.success_rate_pct)} | n/a | n/a |`);
    lines.push(`| External attempts | ${summary.target_external_attempts.attempts} | n/a | n/a |`);
    lines.push(`| External fallbacks | ${summary.target_external_attempts.fallbacks} | n/a | n/a |`);
  }
  lines.push('');

  lines.push('## Latency');
  lines.push('');
  lines.push(renderLatencyInline('Target end-to-end', summary.target.e2e_latency_ms));
  lines.push(renderLatencyInline('Baseline end-to-end', summary.baseline.e2e_latency_ms));
  lines.push(`- P50 delta (target - baseline): ${formatMs(summary.comparison.p50_latency_delta_ms)}`);
  lines.push(`- P95 delta (target - baseline): ${formatMs(summary.comparison.p95_latency_delta_ms)}`);
  if (summary.target_external_attempts) {
    lines.push(renderLatencyInline('Target external attempt', summary.target_external_attempts.attempt_latency_ms));
  }
  lines.push('');

  lines.push('## Fallback Taxonomy');
  lines.push('');
  lines.push('| Category | Count | Percent |');
  lines.push('| --- | ---: | ---: |');
  if (summary.fallback_categories.length === 0) {
    lines.push('| _none_ | 0 | 0.00% |');
  } else {
    for (const item of summary.fallback_categories) {
      lines.push(`| ${tableCell(item.category)} | ${item.count} | ${item.pct.toFixed(2)}% |`);
    }
  }
  lines.push('');

  lines.push('## Top Fallback Reasons');
  lines.push('');
  if (summary.fallback_top_reasons.length === 0) {
    lines.push('- none');
  } else {
    for (const item of summary.fallback_top_reasons) {
      lines.push(`- ${item.reason} (${item.count})`);
    }
  }
  lines.push('');

  if (gate) {
    lines.push('## Gate Evaluation');
    lines.push('');
    lines.push(`- Gate result: ${gate.pass ? 'PASS' : 'HOLD'}`);
    for (const criterion of gate.criteria) {
      // Rendered as a task-list item: checked when the criterion passed.
      lines.push(`- [${criterion.pass ? 'x' : ' '}] ${criterion.criterion}: actual=${criterion.actual}, threshold=${criterion.threshold}`);
    }
    lines.push('');
  }

  return lines.join('\n');
}
|
||||
Reference in New Issue
Block a user