feat(heartbeat): add provider error-rate spike check

This commit is contained in:
William Valentin
2026-02-16 13:52:40 -08:00
parent 07340ff0af
commit 71af3b5a42
8 changed files with 120 additions and 6 deletions
+40
View File
@@ -13,6 +13,8 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
disk_threshold_mb: 100,
process_memory_threshold_mb: 1500,
backup_failure_threshold: 1,
provider_error_rate_threshold: 0.5,
provider_error_min_calls: 5,
...overrides,
};
}
@@ -37,6 +39,7 @@ function makeDeps(overrides?: Partial<HeartbeatDeps>): HeartbeatDeps {
hasRun: false,
consecutiveFailures: 0,
}),
getModelCalls: () => [],
...overrides,
};
}
@@ -513,4 +516,41 @@ describe('HeartbeatMonitor', () => {
expect(check.message).toContain('minio unavailable');
});
});
describe('provider_errors check', () => {
  // Builds a monitor from the given deps, runs every configured check, and
  // returns the provider_errors entry (throwing if the monitor did not emit one).
  const runProviderErrorsCheck = async (deps: HeartbeatDeps) => {
    monitor = new HeartbeatMonitor(deps);
    const { checks } = await monitor.runChecks();
    const providerCheck = checks.find((entry) => entry.name === 'provider_errors');
    if (!providerCheck) {
      throw new Error('Expected provider_errors check result');
    }
    return providerCheck;
  };

  it('passes when no model calls are recorded', async () => {
    const outcome = await runProviderErrorsCheck(
      makeDeps({
        config: makeConfig({ checks: ['provider_errors'] }),
        getModelCalls: () => [],
      }),
    );

    expect(outcome.healthy).toBe(true);
    expect(outcome.message).toContain('No model calls');
  });

  it('fails when a provider error rate breaches threshold', async () => {
    // openai: 2 failures out of 4 calls (50%), which meets both the 4-call
    // minimum and the 0.5 threshold. anthropic: 2 clean calls, below the minimum.
    const recordedCalls: Array<{ provider: string; error?: string }> = [
      { provider: 'openai', error: 'rate limited' },
      { provider: 'openai', error: 'timeout' },
      { provider: 'openai' },
      { provider: 'openai' },
      { provider: 'anthropic' },
      { provider: 'anthropic' },
    ];

    const outcome = await runProviderErrorsCheck(
      makeDeps({
        config: makeConfig({
          checks: ['provider_errors'],
          provider_error_min_calls: 4,
          provider_error_rate_threshold: 0.5,
        }),
        getModelCalls: () => recordedCalls,
      }),
    );

    expect(outcome.healthy).toBe(false);
    expect(outcome.message).toContain('openai');
  });
});
});
+60
View File
@@ -41,6 +41,7 @@ export interface HeartbeatDeps {
channelLookup: ChannelLookup;
processMemoryUsageMb?: () => number;
backupHealthProvider?: () => BackupHealthSnapshot;
getModelCalls?: () => Array<{ provider: string; error?: string }>;
}
/**
@@ -135,6 +136,9 @@ export class HeartbeatMonitor {
case 'backup':
result = this.checkBackup(start);
break;
case 'provider_errors':
result = this.checkProviderErrors(start);
break;
default:
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
}
@@ -388,6 +392,62 @@ export class HeartbeatMonitor {
}
}
/**
 * Flags providers whose share of failed model calls is at or above the
 * configured error-rate threshold.
 *
 * Calls are read from the optional `getModelCalls` dependency and grouped by
 * `provider`; a call counts as failed when its `error` field is truthy.
 * Providers with fewer than `provider_error_min_calls` calls are ignored so a
 * handful of early failures cannot trip the check.
 *
 * @param start - Timestamp in the same units as `Date.now()`, taken when the
 *   check began; used only to compute `durationMs`.
 * @returns A healthy result when there are no calls or no provider breaches
 *   the threshold; an unhealthy result listing each offending provider
 *   otherwise. Any exception thrown while gathering or tallying calls is
 *   converted into an unhealthy result rather than propagated.
 */
private checkProviderErrors(start: number): CheckResult {
  const name = 'provider_errors';
  // Duration is measured at the moment each result is built.
  const finish = (healthy: boolean, message: string): CheckResult => ({
    name,
    healthy,
    message,
    durationMs: Date.now() - start,
  });

  try {
    const calls = this.deps.getModelCalls ? this.deps.getModelCalls() : [];
    if (calls.length === 0) {
      return finish(true, 'No model calls recorded yet');
    }

    // Tally total and failed calls per provider.
    const providers = new Map<string, { total: number; errors: number }>();
    for (const call of calls) {
      const tally = providers.get(call.provider) ?? { total: 0, errors: 0 };
      tally.total += 1;
      if (call.error) {
        tally.errors += 1;
      }
      providers.set(call.provider, tally);
    }

    const minCalls = this.deps.config.provider_error_min_calls;
    const threshold = this.deps.config.provider_error_rate_threshold;

    const offenders: string[] = [];
    for (const [provider, stats] of providers) {
      if (stats.total < minCalls) {
        continue;
      }
      const errorRate = stats.errors / stats.total;
      // Require at least one failed call: with a threshold of 0 the bare
      // `errorRate >= threshold` comparison would otherwise flag providers
      // that have not failed at all ("0% (0/N)").
      if (stats.errors > 0 && errorRate >= threshold) {
        offenders.push(`${provider} ${Math.round(errorRate * 100)}% (${stats.errors}/${stats.total})`);
      }
    }

    if (offenders.length > 0) {
      return finish(false, `High provider error rate: ${offenders.join(', ')}`);
    }
    return finish(true, `Provider error rates below threshold across ${providers.size} provider(s)`);
  } catch (err) {
    return finish(false, err instanceof Error ? err.message : 'Failed to check provider error rates');
  }
}
// ── Notification ───────────────────────────────────────────────
private async notify(text: string): Promise<void> {
+3
View File
@@ -967,8 +967,11 @@ describe('configSchema automation', () => {
const result = configSchema.parse(baseConfig);
expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500);
expect(result.automation.heartbeat.backup_failure_threshold).toBe(1);
expect(result.automation.heartbeat.provider_error_rate_threshold).toBe(0.5);
expect(result.automation.heartbeat.provider_error_min_calls).toBe(5);
expect(result.automation.heartbeat.checks).toContain('process_memory');
expect(result.automation.heartbeat.checks).toContain('backup');
expect(result.automation.heartbeat.checks).toContain('provider_errors');
});
});
+4 -2
View File
@@ -302,12 +302,12 @@ const gmailSchema = z.object({
message: z.string().default('New email from {{from}}: {{subject}}\n\n{{snippet}}'),
}).optional();
const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']);
// Check names accepted in the heartbeat `checks` list.
const heartbeatCheckSchema = z.enum([
  'gateway',
  'model',
  'channels',
  'memory',
  'disk',
  'process_memory',
  'backup',
  'provider_errors',
]);
const heartbeatSchema = z.object({
enabled: z.boolean().default(false),
interval: z.string().default('5m'),
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']),
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']),
notify: z.object({
channel: z.string().min(1),
peer: z.string().min(1),
@@ -316,6 +316,8 @@ const heartbeatSchema = z.object({
disk_threshold_mb: z.number().min(10).default(100),
process_memory_threshold_mb: z.number().min(64).default(1500),
backup_failure_threshold: z.number().min(1).max(10).default(1),
provider_error_rate_threshold: z.number().min(0).max(1).default(0.5),
provider_error_min_calls: z.number().min(1).default(5),
}).default({});
const gcalSchema = z.object({
+1
View File
@@ -479,6 +479,7 @@ export async function startServices(deps: {
memoryDir: config.memory.enabled ? memoryDir : undefined,
dataDir,
channelLookup: channelRegistry,
getModelCalls: () => gateway.getMetrics().getModelMetrics(),
});
heartbeatMonitor.start();