feat(heartbeat): add provider error-rate spike check
This commit is contained in:
@@ -41,6 +41,7 @@ export interface HeartbeatDeps {
|
||||
channelLookup: ChannelLookup;
|
||||
processMemoryUsageMb?: () => number;
|
||||
backupHealthProvider?: () => BackupHealthSnapshot;
|
||||
getModelCalls?: () => Array<{ provider: string; error?: string }>;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -135,6 +136,9 @@ export class HeartbeatMonitor {
|
||||
case 'backup':
|
||||
result = this.checkBackup(start);
|
||||
break;
|
||||
case 'provider_errors':
|
||||
result = this.checkProviderErrors(start);
|
||||
break;
|
||||
default:
|
||||
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
|
||||
}
|
||||
@@ -388,6 +392,62 @@ export class HeartbeatMonitor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Health check that looks for providers with a high share of failed model calls.
 *
 * Calls are read from the optional `getModelCalls` dependency and grouped by
 * `provider`; a call counts as failed when its `error` field is truthy.
 * Providers with fewer than `config.provider_error_min_calls` calls are not
 * evaluated. A provider is reported when its error rate is at or above
 * `config.provider_error_rate_threshold`.
 *
 * @param start - Timestamp in ms (compared against `Date.now()`) used to compute `durationMs`.
 * @returns A `provider_errors` result: healthy when nothing was recorded or no
 *   provider reached the threshold; unhealthy when at least one did, or when
 *   anything threw while running the check.
 */
private checkProviderErrors(start: number): CheckResult {
  // Single place that shapes the result; elapsed time is taken at the moment of return.
  const report = (healthy: boolean, message: string): CheckResult => ({
    name: 'provider_errors',
    healthy,
    message,
    durationMs: Date.now() - start,
  });

  try {
    // The call source is optional; without it there is simply nothing to inspect.
    let calls: Array<{ provider: string; error?: string }> = [];
    if (this.deps.getModelCalls) {
      calls = this.deps.getModelCalls();
    }
    if (calls.length === 0) {
      return report(true, 'No model calls recorded yet');
    }

    // Per-provider counters, kept in first-seen order.
    const tallies = new Map<string, { total: number; errors: number }>();
    for (const entry of calls) {
      let tally = tallies.get(entry.provider);
      if (tally === undefined) {
        tally = { total: 0, errors: 0 };
        tallies.set(entry.provider, tally);
      }
      tally.total += 1;
      if (entry.error) {
        tally.errors += 1;
      }
    }

    const minCalls = this.deps.config.provider_error_min_calls;
    const threshold = this.deps.config.provider_error_rate_threshold;

    const offenders: string[] = [];
    tallies.forEach((tally, provider) => {
      // Too few samples for this provider to judge its error rate.
      if (tally.total < minCalls) {
        return;
      }
      const rate = tally.errors / tally.total;
      if (rate >= threshold) {
        offenders.push(`${provider} ${Math.round(rate * 100)}% (${tally.errors}/${tally.total})`);
      }
    });

    if (offenders.length > 0) {
      return report(false, `High provider error rate: ${offenders.join(', ')}`);
    }
    return report(true, `Provider error rates below threshold across ${tallies.size} provider(s)`);
  } catch (err) {
    // A failure while gathering or evaluating calls is itself reported as unhealthy.
    return report(false, err instanceof Error ? err.message : 'Failed to check provider error rates');
  }
}
|
||||
|
||||
// ── Notification ───────────────────────────────────────────────
|
||||
|
||||
private async notify(text: string): Promise<void> {
|
||||
|
||||
Reference in New Issue
Block a user