From 71af3b5a42707a4edbdb2e94b408c8baee8dff09 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Mon, 16 Feb 2026 13:52:40 -0800 Subject: [PATCH] feat(heartbeat): add provider error-rate spike check --- README.md | 7 +++- config/default.yaml | 4 ++- docs/plans/state.json | 5 +-- src/automation/heartbeat.test.ts | 40 +++++++++++++++++++++ src/automation/heartbeat.ts | 60 ++++++++++++++++++++++++++++++++ src/config/schema.test.ts | 3 ++ src/config/schema.ts | 6 ++-- src/daemon/services.ts | 1 + 8 files changed, 120 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 08273de..eea5b87 100644 --- a/README.md +++ b/README.md @@ -652,7 +652,7 @@ automation: heartbeat: enabled: true interval: "5m" # Check every 5 minutes - checks: [gateway, model, channels, memory, disk, process_memory, backup] + checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors] notify: channel: telegram peer: "123456789" @@ -660,6 +660,8 @@ automation: disk_threshold_mb: 100 # Warn when <100MB free process_memory_threshold_mb: 1500 # Warn when RSS memory exceeds threshold backup_failure_threshold: 1 # Warn when backup failures meet threshold + provider_error_rate_threshold: 0.5 # Warn when provider error rate >= threshold + provider_error_min_calls: 5 # Minimum model calls per provider before evaluation ``` ### Heartbeat Checks @@ -673,6 +675,7 @@ automation: | `disk` | Free disk space exceeds threshold | | `process_memory` | Flynn process RSS memory usage stays under threshold | | `backup` | Backup scheduler consecutive failures stay under threshold | +| `provider_errors` | Model provider error rates stay below threshold | The monitor sends a notification when failures reach the configured threshold and a recovery notification when all checks pass again. @@ -689,6 +692,8 @@ The monitor sends a notification when failures reach the configured threshold an | `disk_threshold_mb` | no | Disk space warning threshold in MB (default: `100`) | | `process_memory_threshold_mb` | no | RSS memory threshold in MB for `process_memory` check (default: `1500`) | | `backup_failure_threshold` | no | Consecutive backup failures threshold for `backup` check (default: `1`) | +| `provider_error_rate_threshold` | no | Error-rate threshold (0..1) for `provider_errors` check (default: `0.5`) | +| `provider_error_min_calls` | no | Minimum provider calls before applying error-rate threshold (default: `5`) | ## Gmail Pub/Sub Watcher diff --git a/config/default.yaml b/config/default.yaml index d2eeab4..2c0ede4 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -291,7 +291,7 @@ hooks: # heartbeat: # enabled: false # interval: "5m" -# checks: [gateway, model, channels, memory, disk, process_memory, backup] +# checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors] # notify: # channel: telegram # peer: "123456789" @@ -299,6 +299,8 @@ hooks: # disk_threshold_mb: 100 # process_memory_threshold_mb: 1500 # backup_failure_threshold: 1 +# provider_error_rate_threshold: 0.5 +# provider_error_min_calls: 5 # ── Backup ────────────────────────────────────────────────────────── # Snapshot sessions.db, vectors.db (optional), and memory/ into a tarball. diff --git a/docs/plans/state.json b/docs/plans/state.json index f715197..981598c 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -7,7 +7,7 @@ "status": "completed", "date": "2026-02-16", "updated": "2026-02-16", - "summary": "Added first-class automation presets and scheduling upgrades: `automation.daily_briefing` now auto-registers an opinionated cron job for morning briefings, and backup scheduling now supports cron expressions via `backup.schedule` plus optional `backup.run_on_start` while preserving interval fallback. Added `BackupScheduler` with `backup.notify` channel alerts, configurable `backup.failure_threshold`, and recovery notifications (`backup.notify_recovery`) so backup failures/recoveries proactively notify operators. Extended heartbeat monitoring with `process_memory` and `backup` checks (with thresholds) so high RSS usage and backup failure streaks proactively trigger health alerts.", + "summary": "Added first-class automation presets and scheduling upgrades: `automation.daily_briefing` now auto-registers an opinionated cron job for morning briefings, and backup scheduling now supports cron expressions via `backup.schedule` plus optional `backup.run_on_start` while preserving interval fallback. Added `BackupScheduler` with `backup.notify` channel alerts, configurable `backup.failure_threshold`, and recovery notifications (`backup.notify_recovery`) so backup failures/recoveries proactively notify operators. Extended heartbeat monitoring with `process_memory`, `backup`, and `provider_errors` checks (with thresholds) so high RSS usage, backup failure streaks, and model-provider error spikes proactively trigger health alerts.", "files_modified": [ "src/config/schema.ts", "src/config/schema.test.ts", @@ -24,6 +24,7 @@ "src/daemon/channels.ts", "src/daemon/channels.test.ts", "src/daemon/index.ts", + "src/daemon/services.ts", "src/gateway/handlers/services.ts", "src/gateway/handlers/services.test.ts", "config/default.yaml", @@ -3315,7 +3316,7 @@ } }, "overall_progress": { - "total_test_count": 1830, + "total_test_count": 1832, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", diff --git a/src/automation/heartbeat.test.ts b/src/automation/heartbeat.test.ts index 3f41042..ca0591e 100644 --- a/src/automation/heartbeat.test.ts +++ b/src/automation/heartbeat.test.ts @@ -13,6 +13,8 @@ function makeConfig(overrides?: Partial): HeartbeatConfig { disk_threshold_mb: 100, process_memory_threshold_mb: 1500, backup_failure_threshold: 1, + provider_error_rate_threshold: 0.5, + provider_error_min_calls: 5, ...overrides, }; } @@ -37,6 +39,7 @@ function makeDeps(overrides?: Partial): HeartbeatDeps { hasRun: false, consecutiveFailures: 0, }), + getModelCalls: () => [], ...overrides, }; } @@ -513,4 +516,41 @@ describe('HeartbeatMonitor', () => { expect(check.message).toContain('minio unavailable'); }); }); + + describe('provider_errors check', () => { + it('passes when no model calls are recorded', async () => { + const deps = makeDeps({ + config: makeConfig({ checks: ['provider_errors'] }), + getModelCalls: () => [], + }); + monitor = new HeartbeatMonitor(deps); + + const result = await monitor.runChecks(); + const check = result.checks.find((c) => c.name === 'provider_errors'); + if (!check) {throw new Error('Expected provider_errors check result');} + expect(check.healthy).toBe(true); + expect(check.message).toContain('No model calls'); + }); + + it('fails when a provider error rate breaches threshold', async () => { + const deps = makeDeps({ + config: makeConfig({ checks: ['provider_errors'], provider_error_min_calls: 4, provider_error_rate_threshold: 0.5 }), + getModelCalls: () => [ + { provider: 'openai', error: 'rate limited' }, + { provider: 'openai', error: 'timeout' }, + { provider: 'openai' }, + { provider: 'openai' }, + { provider: 'anthropic' }, + { provider: 'anthropic' }, + ], + }); + monitor = new HeartbeatMonitor(deps); + + const result = await monitor.runChecks(); + const check = result.checks.find((c) => c.name === 'provider_errors'); + if (!check) {throw new Error('Expected provider_errors check result');} + expect(check.healthy).toBe(false); + expect(check.message).toContain('openai'); + }); + }); }); diff --git a/src/automation/heartbeat.ts b/src/automation/heartbeat.ts index 223fd5f..6561cce 100644 --- a/src/automation/heartbeat.ts +++ b/src/automation/heartbeat.ts @@ -41,6 +41,7 @@ export interface HeartbeatDeps { channelLookup: ChannelLookup; processMemoryUsageMb?: () => number; backupHealthProvider?: () => BackupHealthSnapshot; + getModelCalls?: () => Array<{ provider: string; error?: string }>; } /** @@ -135,6 +136,9 @@ export class HeartbeatMonitor { case 'backup': result = this.checkBackup(start); break; + case 'provider_errors': + result = this.checkProviderErrors(start); + break; default: result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start }; } @@ -388,6 +392,62 @@ export class HeartbeatMonitor { } } + private checkProviderErrors(start: number): CheckResult { + try { + const calls = this.deps.getModelCalls ? this.deps.getModelCalls() : []; + if (calls.length === 0) { + return { name: 'provider_errors', healthy: true, message: 'No model calls recorded yet', durationMs: Date.now() - start }; + } + + const providers = new Map(); + for (const call of calls) { + const current = providers.get(call.provider) ?? { total: 0, errors: 0 }; + current.total += 1; + if (call.error) { + current.errors += 1; + } + providers.set(call.provider, current); + } + + const minCalls = this.deps.config.provider_error_min_calls; + const threshold = this.deps.config.provider_error_rate_threshold; + const offenders: string[] = []; + + for (const [provider, stats] of providers) { + if (stats.total < minCalls) { + continue; + } + const errorRate = stats.errors / stats.total; + if (errorRate >= threshold) { + offenders.push(`${provider} ${Math.round(errorRate * 100)}% (${stats.errors}/${stats.total})`); + } + } + + if (offenders.length > 0) { + return { + name: 'provider_errors', + healthy: false, + message: `High provider error rate: ${offenders.join(', ')}`, + durationMs: Date.now() - start, + }; + } + + return { + name: 'provider_errors', + healthy: true, + message: `Provider error rates below threshold across ${providers.size} provider(s)`, + durationMs: Date.now() - start, + }; + } catch (err) { + return { + name: 'provider_errors', + healthy: false, + message: err instanceof Error ? err.message : 'Failed to check provider error rates', + durationMs: Date.now() - start, + }; + } + } + // ── Notification ─────────────────────────────────────────────── private async notify(text: string): Promise { diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts index b59000b..cc08337 100644 --- a/src/config/schema.test.ts +++ b/src/config/schema.test.ts @@ -967,8 +967,11 @@ describe('configSchema automation', () => { const result = configSchema.parse(baseConfig); expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500); expect(result.automation.heartbeat.backup_failure_threshold).toBe(1); + expect(result.automation.heartbeat.provider_error_rate_threshold).toBe(0.5); + expect(result.automation.heartbeat.provider_error_min_calls).toBe(5); expect(result.automation.heartbeat.checks).toContain('process_memory'); expect(result.automation.heartbeat.checks).toContain('backup'); + expect(result.automation.heartbeat.checks).toContain('provider_errors'); }); }); diff --git a/src/config/schema.ts b/src/config/schema.ts index 93d19ea..748eb29 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -302,12 +302,12 @@ const gmailSchema = z.object({ message: z.string().default('New email from {{from}}: {{subject}}\n\n{{snippet}}'), }).optional(); -const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']); +const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']); const heartbeatSchema = z.object({ enabled: z.boolean().default(false), interval: z.string().default('5m'), - checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']), + checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']), notify: z.object({ channel: z.string().min(1), peer: z.string().min(1), @@ -316,6 +316,8 @@ const heartbeatSchema = z.object({ disk_threshold_mb: z.number().min(10).default(100), process_memory_threshold_mb: z.number().min(64).default(1500), backup_failure_threshold: z.number().min(1).max(10).default(1), + provider_error_rate_threshold: z.number().min(0).max(1).default(0.5), + provider_error_min_calls: z.number().min(1).default(5), }).default({}); const gcalSchema = z.object({ diff --git a/src/daemon/services.ts b/src/daemon/services.ts index b9a001e..78193f9 100644 --- a/src/daemon/services.ts +++ b/src/daemon/services.ts @@ -479,6 +479,7 @@ export async function startServices(deps: { memoryDir: config.memory.enabled ? memoryDir : undefined, dataDir, channelLookup: channelRegistry, + getModelCalls: () => gateway.getMetrics().getModelMetrics(), }); heartbeatMonitor.start();