feat(audit): add sample-size gate thresholds for canary evaluation
This commit is contained in:
@@ -33,6 +33,9 @@ function usage(): string {
|
|||||||
' --out <path> Write output to file instead of stdout',
|
' --out <path> Write output to file instead of stdout',
|
||||||
'',
|
'',
|
||||||
'Gate options (optional):',
|
'Gate options (optional):',
|
||||||
|
' --gate-min-target-routes <number>',
|
||||||
|
' --gate-min-baseline-routes <number>',
|
||||||
|
' --gate-min-target-attempts <number>',
|
||||||
' --gate-max-completion-drop-pp <number>',
|
' --gate-max-completion-drop-pp <number>',
|
||||||
' --gate-max-p50-latency-increase-ms <number>',
|
' --gate-max-p50-latency-increase-ms <number>',
|
||||||
' --gate-max-p95-latency-increase-ms <number>',
|
' --gate-max-p95-latency-increase-ms <number>',
|
||||||
@@ -126,6 +129,9 @@ async function main(): Promise<void> {
|
|||||||
source: { type: 'string' },
|
source: { type: 'string' },
|
||||||
format: { type: 'string' },
|
format: { type: 'string' },
|
||||||
out: { type: 'string' },
|
out: { type: 'string' },
|
||||||
|
'gate-min-target-routes': { type: 'string' },
|
||||||
|
'gate-min-baseline-routes': { type: 'string' },
|
||||||
|
'gate-min-target-attempts': { type: 'string' },
|
||||||
'gate-max-completion-drop-pp': { type: 'string' },
|
'gate-max-completion-drop-pp': { type: 'string' },
|
||||||
'gate-max-p50-latency-increase-ms': { type: 'string' },
|
'gate-max-p50-latency-increase-ms': { type: 'string' },
|
||||||
'gate-max-p95-latency-increase-ms': { type: 'string' },
|
'gate-max-p95-latency-increase-ms': { type: 'string' },
|
||||||
@@ -171,6 +177,9 @@ async function main(): Promise<void> {
|
|||||||
const summary = summarizeBackendCanary(events, summaryOptions);
|
const summary = summarizeBackendCanary(events, summaryOptions);
|
||||||
|
|
||||||
const gateThresholds: BackendCanaryGateThresholds = {
|
const gateThresholds: BackendCanaryGateThresholds = {
|
||||||
|
minTargetRoutes: parseOptionalNumber(values['gate-min-target-routes'], '--gate-min-target-routes'),
|
||||||
|
minBaselineRoutes: parseOptionalNumber(values['gate-min-baseline-routes'], '--gate-min-baseline-routes'),
|
||||||
|
minTargetAttempts: parseOptionalNumber(values['gate-min-target-attempts'], '--gate-min-target-attempts'),
|
||||||
maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
|
maxCompletionRateDropPp: parseOptionalNumber(values['gate-max-completion-drop-pp'], '--gate-max-completion-drop-pp'),
|
||||||
maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
|
maxP50LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p50-latency-increase-ms'], '--gate-max-p50-latency-increase-ms'),
|
||||||
maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
|
maxP95LatencyIncreaseMs: parseOptionalNumber(values['gate-max-p95-latency-increase-ms'], '--gate-max-p95-latency-increase-ms'),
|
||||||
|
|||||||
@@ -149,6 +149,63 @@ describe('summarizeBackendCanary', () => {
|
|||||||
expect(summary.target.routes).toBe(1);
|
expect(summary.target.routes).toBe(1);
|
||||||
expect(summary.baseline.routes).toBe(0);
|
expect(summary.baseline.routes).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('classifies pi-specific fallback reasons into stable categories', () => {
|
||||||
|
const events: AuditEvent[] = [
|
||||||
|
makeEvent(1000, 'backend.route', {
|
||||||
|
session_id: 'telegram:canary',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '8367012007',
|
||||||
|
selected_backend: 'pi_embedded',
|
||||||
|
source: 'agent_override',
|
||||||
|
}),
|
||||||
|
makeEvent(1100, 'backend.fallback', {
|
||||||
|
session_id: 'telegram:canary',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '8367012007',
|
||||||
|
from_backend: 'pi_embedded',
|
||||||
|
to_backend: 'native',
|
||||||
|
reason: 'Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession)',
|
||||||
|
duration_ms: 100,
|
||||||
|
}),
|
||||||
|
makeEvent(1200, 'session.message', {
|
||||||
|
session_id: 'telegram:canary',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 20,
|
||||||
|
}),
|
||||||
|
makeEvent(1300, 'backend.route', {
|
||||||
|
session_id: 'telegram:canary',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '8367012007',
|
||||||
|
selected_backend: 'pi_embedded',
|
||||||
|
source: 'agent_override',
|
||||||
|
}),
|
||||||
|
makeEvent(1400, 'backend.fallback', {
|
||||||
|
session_id: 'telegram:canary',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '8367012007',
|
||||||
|
from_backend: 'pi_embedded',
|
||||||
|
to_backend: 'native',
|
||||||
|
reason: 'Pi Agent runtime produced no assistant text',
|
||||||
|
duration_ms: 120,
|
||||||
|
}),
|
||||||
|
makeEvent(1500, 'session.message', {
|
||||||
|
session_id: 'telegram:canary',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 20,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const summary = summarizeBackendCanary(events, {
|
||||||
|
targetBackend: 'pi_embedded',
|
||||||
|
baselineBackend: 'native',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(summary.fallback_categories).toEqual([
|
||||||
|
{ category: 'empty_assistant_text', count: 1, pct: 50 },
|
||||||
|
{ category: 'pi_module_interface', count: 1, pct: 50 },
|
||||||
|
]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('evaluateBackendCanaryGate', () => {
|
describe('evaluateBackendCanaryGate', () => {
|
||||||
@@ -210,4 +267,59 @@ describe('evaluateBackendCanaryGate', () => {
|
|||||||
expect(markdown).toContain('Pi Embedded Canary Summary');
|
expect(markdown).toContain('Pi Embedded Canary Summary');
|
||||||
expect(markdown).toContain('Gate result: PASS');
|
expect(markdown).toContain('Gate result: PASS');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('fails gate when minimum sample thresholds are not met', () => {
|
||||||
|
const events: AuditEvent[] = [
|
||||||
|
makeEvent(1000, 'backend.route', {
|
||||||
|
session_id: 's1',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '1',
|
||||||
|
selected_backend: 'pi_embedded',
|
||||||
|
source: 'agent_override',
|
||||||
|
}),
|
||||||
|
makeEvent(1100, 'backend.success', {
|
||||||
|
session_id: 's1',
|
||||||
|
channel: 'telegram',
|
||||||
|
sender: '1',
|
||||||
|
backend: 'pi_embedded',
|
||||||
|
duration_ms: 100,
|
||||||
|
response_length: 8,
|
||||||
|
}),
|
||||||
|
makeEvent(1200, 'session.message', {
|
||||||
|
session_id: 's1',
|
||||||
|
role: 'assistant',
|
||||||
|
content_length: 20,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const summary = summarizeBackendCanary(events, {
|
||||||
|
targetBackend: 'pi_embedded',
|
||||||
|
baselineBackend: 'native',
|
||||||
|
});
|
||||||
|
|
||||||
|
const gate = evaluateBackendCanaryGate(summary, {
|
||||||
|
minTargetRoutes: 3,
|
||||||
|
minBaselineRoutes: 2,
|
||||||
|
minTargetAttempts: 3,
|
||||||
|
maxFallbackRatePct: 5,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(gate.pass).toBe(false);
|
||||||
|
expect(gate.criteria).toHaveLength(4);
|
||||||
|
expect(gate.criteria[0]).toEqual(expect.objectContaining({
|
||||||
|
criterion: 'Minimum target routes',
|
||||||
|
pass: false,
|
||||||
|
actual: '1',
|
||||||
|
}));
|
||||||
|
expect(gate.criteria[1]).toEqual(expect.objectContaining({
|
||||||
|
criterion: 'Minimum baseline routes',
|
||||||
|
pass: false,
|
||||||
|
actual: '0',
|
||||||
|
}));
|
||||||
|
expect(gate.criteria[2]).toEqual(expect.objectContaining({
|
||||||
|
criterion: 'Minimum target external attempts',
|
||||||
|
pass: false,
|
||||||
|
actual: '1',
|
||||||
|
}));
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -78,6 +78,9 @@ export interface BackendCanarySummary {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface BackendCanaryGateThresholds {
|
export interface BackendCanaryGateThresholds {
|
||||||
|
minTargetRoutes?: number;
|
||||||
|
minBaselineRoutes?: number;
|
||||||
|
minTargetAttempts?: number;
|
||||||
maxCompletionRateDropPp?: number;
|
maxCompletionRateDropPp?: number;
|
||||||
maxP50LatencyIncreaseMs?: number;
|
maxP50LatencyIncreaseMs?: number;
|
||||||
maxP95LatencyIncreaseMs?: number;
|
maxP95LatencyIncreaseMs?: number;
|
||||||
@@ -182,6 +185,18 @@ function normalizeFallbackCategory(reason: string): string {
|
|||||||
if (!normalized) {
|
if (!normalized) {
|
||||||
return 'unknown';
|
return 'unknown';
|
||||||
}
|
}
|
||||||
|
if (
|
||||||
|
normalized.includes('does not expose a supported session factory')
|
||||||
|
|| normalized.includes('expected one of: createagentsession')
|
||||||
|
) {
|
||||||
|
return 'pi_module_interface';
|
||||||
|
}
|
||||||
|
if (
|
||||||
|
normalized.includes('produced no assistant text')
|
||||||
|
|| normalized.includes('no assistant text')
|
||||||
|
) {
|
||||||
|
return 'empty_assistant_text';
|
||||||
|
}
|
||||||
if (normalized.includes('timeout') || normalized.includes('timed out')) {
|
if (normalized.includes('timeout') || normalized.includes('timed out')) {
|
||||||
return 'timeout';
|
return 'timeout';
|
||||||
}
|
}
|
||||||
@@ -470,6 +485,34 @@ export function evaluateBackendCanaryGate(
|
|||||||
): BackendCanaryGateResult {
|
): BackendCanaryGateResult {
|
||||||
const criteria: BackendCanaryGateResult['criteria'] = [];
|
const criteria: BackendCanaryGateResult['criteria'] = [];
|
||||||
|
|
||||||
|
if (typeof thresholds.minTargetRoutes === 'number') {
|
||||||
|
criteria.push({
|
||||||
|
criterion: 'Minimum target routes',
|
||||||
|
pass: summary.target.routes >= thresholds.minTargetRoutes,
|
||||||
|
actual: `${summary.target.routes}`,
|
||||||
|
threshold: `>= ${thresholds.minTargetRoutes}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof thresholds.minBaselineRoutes === 'number') {
|
||||||
|
criteria.push({
|
||||||
|
criterion: 'Minimum baseline routes',
|
||||||
|
pass: summary.baseline.routes >= thresholds.minBaselineRoutes,
|
||||||
|
actual: `${summary.baseline.routes}`,
|
||||||
|
threshold: `>= ${thresholds.minBaselineRoutes}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof thresholds.minTargetAttempts === 'number') {
|
||||||
|
const attempts = summary.target_external_attempts?.attempts ?? 0;
|
||||||
|
criteria.push({
|
||||||
|
criterion: 'Minimum target external attempts',
|
||||||
|
pass: attempts >= thresholds.minTargetAttempts,
|
||||||
|
actual: `${attempts}`,
|
||||||
|
threshold: `>= ${thresholds.minTargetAttempts}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
|
if (typeof thresholds.maxCompletionRateDropPp === 'number') {
|
||||||
const delta = summary.comparison.completion_rate_delta_pp;
|
const delta = summary.comparison.completion_rate_delta_pp;
|
||||||
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);
|
const pass = delta !== null && delta >= (-thresholds.maxCompletionRateDropPp);
|
||||||
|
|||||||
Reference in New Issue
Block a user