feat(heartbeat): add provider error-rate spike check
This commit is contained in:
@@ -652,7 +652,7 @@ automation:
|
|||||||
heartbeat:
|
heartbeat:
|
||||||
enabled: true
|
enabled: true
|
||||||
interval: "5m" # Check every 5 minutes
|
interval: "5m" # Check every 5 minutes
|
||||||
checks: [gateway, model, channels, memory, disk, process_memory, backup]
|
checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors]
|
||||||
notify:
|
notify:
|
||||||
channel: telegram
|
channel: telegram
|
||||||
peer: "123456789"
|
peer: "123456789"
|
||||||
@@ -660,6 +660,8 @@ automation:
|
|||||||
disk_threshold_mb: 100 # Warn when <100MB free
|
disk_threshold_mb: 100 # Warn when <100MB free
|
||||||
process_memory_threshold_mb: 1500 # Warn when RSS memory exceeds threshold
|
process_memory_threshold_mb: 1500 # Warn when RSS memory exceeds threshold
|
||||||
backup_failure_threshold: 1 # Warn when backup failures meet threshold
|
backup_failure_threshold: 1 # Warn when backup failures meet threshold
|
||||||
|
provider_error_rate_threshold: 0.5 # Warn when provider error rate >= threshold
|
||||||
|
provider_error_min_calls: 5 # Minimum model calls per provider before evaluation
|
||||||
```
|
```
|
||||||
|
|
||||||
### Heartbeat Checks
|
### Heartbeat Checks
|
||||||
@@ -673,6 +675,7 @@ automation:
|
|||||||
| `disk` | Free disk space exceeds threshold |
|
| `disk` | Free disk space exceeds threshold |
|
||||||
| `process_memory` | Flynn process RSS memory usage stays under threshold |
|
| `process_memory` | Flynn process RSS memory usage stays under threshold |
|
||||||
| `backup` | Backup scheduler consecutive failures stay under threshold |
|
| `backup` | Backup scheduler consecutive failures stay under threshold |
|
||||||
|
| `provider_errors` | Model provider error rates stay below threshold |
|
||||||
|
|
||||||
The monitor sends a notification when failures reach the configured threshold and a recovery notification when all checks pass again.
|
The monitor sends a notification when failures reach the configured threshold and a recovery notification when all checks pass again.
|
||||||
|
|
||||||
@@ -689,6 +692,8 @@ The monitor sends a notification when failures reach the configured threshold an
|
|||||||
| `disk_threshold_mb` | no | Disk space warning threshold in MB (default: `100`) |
|
| `disk_threshold_mb` | no | Disk space warning threshold in MB (default: `100`) |
|
||||||
| `process_memory_threshold_mb` | no | RSS memory threshold in MB for `process_memory` check (default: `1500`) |
|
| `process_memory_threshold_mb` | no | RSS memory threshold in MB for `process_memory` check (default: `1500`) |
|
||||||
| `backup_failure_threshold` | no | Consecutive backup failures threshold for `backup` check (default: `1`) |
|
| `backup_failure_threshold` | no | Consecutive backup failures threshold for `backup` check (default: `1`) |
|
||||||
|
| `provider_error_rate_threshold` | no | Error-rate threshold (0..1) for `provider_errors` check (default: `0.5`) |
|
||||||
|
| `provider_error_min_calls` | no | Minimum provider calls before applying error-rate threshold (default: `5`) |
|
||||||
|
|
||||||
## Gmail Pub/Sub Watcher
|
## Gmail Pub/Sub Watcher
|
||||||
|
|
||||||
|
|||||||
+3
-1
@@ -291,7 +291,7 @@ hooks:
|
|||||||
# heartbeat:
|
# heartbeat:
|
||||||
# enabled: false
|
# enabled: false
|
||||||
# interval: "5m"
|
# interval: "5m"
|
||||||
# checks: [gateway, model, channels, memory, disk, process_memory, backup]
|
# checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors]
|
||||||
# notify:
|
# notify:
|
||||||
# channel: telegram
|
# channel: telegram
|
||||||
# peer: "123456789"
|
# peer: "123456789"
|
||||||
@@ -299,6 +299,8 @@ hooks:
|
|||||||
# disk_threshold_mb: 100
|
# disk_threshold_mb: 100
|
||||||
# process_memory_threshold_mb: 1500
|
# process_memory_threshold_mb: 1500
|
||||||
# backup_failure_threshold: 1
|
# backup_failure_threshold: 1
|
||||||
|
# provider_error_rate_threshold: 0.5
|
||||||
|
# provider_error_min_calls: 5
|
||||||
|
|
||||||
# ── Backup ──────────────────────────────────────────────────────────
|
# ── Backup ──────────────────────────────────────────────────────────
|
||||||
# Snapshot sessions.db, vectors.db (optional), and memory/ into a tarball.
|
# Snapshot sessions.db, vectors.db (optional), and memory/ into a tarball.
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
"status": "completed",
|
"status": "completed",
|
||||||
"date": "2026-02-16",
|
"date": "2026-02-16",
|
||||||
"updated": "2026-02-16",
|
"updated": "2026-02-16",
|
||||||
"summary": "Added first-class automation presets and scheduling upgrades: `automation.daily_briefing` now auto-registers an opinionated cron job for morning briefings, and backup scheduling now supports cron expressions via `backup.schedule` plus optional `backup.run_on_start` while preserving interval fallback. Added `BackupScheduler` with `backup.notify` channel alerts, configurable `backup.failure_threshold`, and recovery notifications (`backup.notify_recovery`) so backup failures/recoveries proactively notify operators. Extended heartbeat monitoring with `process_memory` and `backup` checks (with thresholds) so high RSS usage and backup failure streaks proactively trigger health alerts.",
|
"summary": "Added first-class automation presets and scheduling upgrades: `automation.daily_briefing` now auto-registers an opinionated cron job for morning briefings, and backup scheduling now supports cron expressions via `backup.schedule` plus optional `backup.run_on_start` while preserving interval fallback. Added `BackupScheduler` with `backup.notify` channel alerts, configurable `backup.failure_threshold`, and recovery notifications (`backup.notify_recovery`) so backup failures/recoveries proactively notify operators. Extended heartbeat monitoring with `process_memory`, `backup`, and `provider_errors` checks (with thresholds) so high RSS usage, backup failure streaks, and model-provider error spikes proactively trigger health alerts.",
|
||||||
"files_modified": [
|
"files_modified": [
|
||||||
"src/config/schema.ts",
|
"src/config/schema.ts",
|
||||||
"src/config/schema.test.ts",
|
"src/config/schema.test.ts",
|
||||||
@@ -24,6 +24,7 @@
|
|||||||
"src/daemon/channels.ts",
|
"src/daemon/channels.ts",
|
||||||
"src/daemon/channels.test.ts",
|
"src/daemon/channels.test.ts",
|
||||||
"src/daemon/index.ts",
|
"src/daemon/index.ts",
|
||||||
|
"src/daemon/services.ts",
|
||||||
"src/gateway/handlers/services.ts",
|
"src/gateway/handlers/services.ts",
|
||||||
"src/gateway/handlers/services.test.ts",
|
"src/gateway/handlers/services.test.ts",
|
||||||
"config/default.yaml",
|
"config/default.yaml",
|
||||||
@@ -3315,7 +3316,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"overall_progress": {
|
"overall_progress": {
|
||||||
"total_test_count": 1830,
|
"total_test_count": 1832,
|
||||||
"all_tests_passing": true,
|
"all_tests_passing": true,
|
||||||
"p0_completion": "3/3 (100%)",
|
"p0_completion": "3/3 (100%)",
|
||||||
"p1_completion": "4/4 (100%)",
|
"p1_completion": "4/4 (100%)",
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
|
|||||||
disk_threshold_mb: 100,
|
disk_threshold_mb: 100,
|
||||||
process_memory_threshold_mb: 1500,
|
process_memory_threshold_mb: 1500,
|
||||||
backup_failure_threshold: 1,
|
backup_failure_threshold: 1,
|
||||||
|
provider_error_rate_threshold: 0.5,
|
||||||
|
provider_error_min_calls: 5,
|
||||||
...overrides,
|
...overrides,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -37,6 +39,7 @@ function makeDeps(overrides?: Partial<HeartbeatDeps>): HeartbeatDeps {
|
|||||||
hasRun: false,
|
hasRun: false,
|
||||||
consecutiveFailures: 0,
|
consecutiveFailures: 0,
|
||||||
}),
|
}),
|
||||||
|
getModelCalls: () => [],
|
||||||
...overrides,
|
...overrides,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -513,4 +516,41 @@ describe('HeartbeatMonitor', () => {
|
|||||||
expect(check.message).toContain('minio unavailable');
|
expect(check.message).toContain('minio unavailable');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// provider_errors: evaluates per-provider error rates from the recorded model calls.
describe('provider_errors check', () => {
  // An empty call history must be treated as healthy rather than as a failure.
  it('passes when no model calls are recorded', async () => {
    const config = makeConfig({ checks: ['provider_errors'] });
    monitor = new HeartbeatMonitor(makeDeps({ config, getModelCalls: () => [] }));

    const { checks } = await monitor.runChecks();
    const providerCheck = checks.find(({ name }) => name === 'provider_errors');
    if (!providerCheck) {
      throw new Error('Expected provider_errors check result');
    }

    expect(providerCheck.healthy).toBe(true);
    expect(providerCheck.message).toContain('No model calls');
  });

  // openai: 2 errors out of 4 calls (50%) with min_calls 4 and threshold 0.5 -> flagged.
  // anthropic: only 2 calls, below min_calls, so it is not evaluated.
  it('fails when a provider error rate breaches threshold', async () => {
    const config = makeConfig({
      checks: ['provider_errors'],
      provider_error_min_calls: 4,
      provider_error_rate_threshold: 0.5,
    });
    monitor = new HeartbeatMonitor(
      makeDeps({
        config,
        getModelCalls: () => [
          { provider: 'openai', error: 'rate limited' },
          { provider: 'openai', error: 'timeout' },
          { provider: 'openai' },
          { provider: 'openai' },
          { provider: 'anthropic' },
          { provider: 'anthropic' },
        ],
      }),
    );

    const { checks } = await monitor.runChecks();
    const providerCheck = checks.find(({ name }) => name === 'provider_errors');
    if (!providerCheck) {
      throw new Error('Expected provider_errors check result');
    }

    expect(providerCheck.healthy).toBe(false);
    expect(providerCheck.message).toContain('openai');
  });
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ export interface HeartbeatDeps {
|
|||||||
channelLookup: ChannelLookup;
|
channelLookup: ChannelLookup;
|
||||||
processMemoryUsageMb?: () => number;
|
processMemoryUsageMb?: () => number;
|
||||||
backupHealthProvider?: () => BackupHealthSnapshot;
|
backupHealthProvider?: () => BackupHealthSnapshot;
|
||||||
|
getModelCalls?: () => Array<{ provider: string; error?: string }>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -135,6 +136,9 @@ export class HeartbeatMonitor {
|
|||||||
case 'backup':
|
case 'backup':
|
||||||
result = this.checkBackup(start);
|
result = this.checkBackup(start);
|
||||||
break;
|
break;
|
||||||
|
case 'provider_errors':
|
||||||
|
result = this.checkProviderErrors(start);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
|
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
|
||||||
}
|
}
|
||||||
@@ -388,6 +392,62 @@ export class HeartbeatMonitor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Heartbeat check: flags model providers whose error rate has reached the
 * configured threshold.
 *
 * Calls are read from the optional `getModelCalls` dependency and grouped by
 * `provider`; a call counts as an error when its `error` field is truthy.
 * Providers with fewer than `provider_error_min_calls` calls are skipped.
 * A provider is reported when it has at least one error and its error rate
 * is >= `provider_error_rate_threshold`.
 *
 * @param start Timestamp (ms) at which the check began; used for `durationMs`.
 * @returns A `provider_errors` result. Exceptions raised while collecting or
 *   evaluating calls are caught and reported as an unhealthy result.
 */
private checkProviderErrors(start: number): CheckResult {
  try {
    // The dependency is optional; without it there is nothing to evaluate.
    const calls = this.deps.getModelCalls ? this.deps.getModelCalls() : [];
    if (calls.length === 0) {
      return { name: 'provider_errors', healthy: true, message: 'No model calls recorded yet', durationMs: Date.now() - start };
    }

    // Tally total and failed calls per provider.
    const providers = new Map<string, { total: number; errors: number }>();
    for (const call of calls) {
      const current = providers.get(call.provider) ?? { total: 0, errors: 0 };
      current.total += 1;
      if (call.error) {
        current.errors += 1;
      }
      providers.set(call.provider, current);
    }

    const minCalls = this.deps.config.provider_error_min_calls;
    const threshold = this.deps.config.provider_error_rate_threshold;
    const offenders: string[] = [];

    for (const [provider, stats] of providers) {
      // Too few samples to judge this provider.
      if (stats.total < minCalls) {
        continue;
      }
      // A provider with no errors is never an offender. Without this guard a
      // threshold of 0 (allowed by the config schema) would flag every
      // provider, because an error rate of 0 satisfies `0 >= 0`.
      if (stats.errors === 0) {
        continue;
      }
      const errorRate = stats.errors / stats.total;
      if (errorRate >= threshold) {
        offenders.push(`${provider} ${Math.round(errorRate * 100)}% (${stats.errors}/${stats.total})`);
      }
    }

    if (offenders.length > 0) {
      return {
        name: 'provider_errors',
        healthy: false,
        message: `High provider error rate: ${offenders.join(', ')}`,
        durationMs: Date.now() - start,
      };
    }

    return {
      name: 'provider_errors',
      healthy: true,
      message: `Provider error rates below threshold across ${providers.size} provider(s)`,
      durationMs: Date.now() - start,
    };
  } catch (err) {
    // Surface collection/evaluation failures as an unhealthy check instead of throwing.
    return {
      name: 'provider_errors',
      healthy: false,
      message: err instanceof Error ? err.message : 'Failed to check provider error rates',
      durationMs: Date.now() - start,
    };
  }
}
|
||||||
|
|
||||||
// ── Notification ───────────────────────────────────────────────
|
// ── Notification ───────────────────────────────────────────────
|
||||||
|
|
||||||
private async notify(text: string): Promise<void> {
|
private async notify(text: string): Promise<void> {
|
||||||
|
|||||||
@@ -967,8 +967,11 @@ describe('configSchema automation', () => {
|
|||||||
const result = configSchema.parse(baseConfig);
|
const result = configSchema.parse(baseConfig);
|
||||||
expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500);
|
expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500);
|
||||||
expect(result.automation.heartbeat.backup_failure_threshold).toBe(1);
|
expect(result.automation.heartbeat.backup_failure_threshold).toBe(1);
|
||||||
|
expect(result.automation.heartbeat.provider_error_rate_threshold).toBe(0.5);
|
||||||
|
expect(result.automation.heartbeat.provider_error_min_calls).toBe(5);
|
||||||
expect(result.automation.heartbeat.checks).toContain('process_memory');
|
expect(result.automation.heartbeat.checks).toContain('process_memory');
|
||||||
expect(result.automation.heartbeat.checks).toContain('backup');
|
expect(result.automation.heartbeat.checks).toContain('backup');
|
||||||
|
expect(result.automation.heartbeat.checks).toContain('provider_errors');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -302,12 +302,12 @@ const gmailSchema = z.object({
|
|||||||
message: z.string().default('New email from {{from}}: {{subject}}\n\n{{snippet}}'),
|
message: z.string().default('New email from {{from}}: {{subject}}\n\n{{snippet}}'),
|
||||||
}).optional();
|
}).optional();
|
||||||
|
|
||||||
const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']);
|
const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']);
|
||||||
|
|
||||||
const heartbeatSchema = z.object({
|
const heartbeatSchema = z.object({
|
||||||
enabled: z.boolean().default(false),
|
enabled: z.boolean().default(false),
|
||||||
interval: z.string().default('5m'),
|
interval: z.string().default('5m'),
|
||||||
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']),
|
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']),
|
||||||
notify: z.object({
|
notify: z.object({
|
||||||
channel: z.string().min(1),
|
channel: z.string().min(1),
|
||||||
peer: z.string().min(1),
|
peer: z.string().min(1),
|
||||||
@@ -316,6 +316,8 @@ const heartbeatSchema = z.object({
|
|||||||
disk_threshold_mb: z.number().min(10).default(100),
|
disk_threshold_mb: z.number().min(10).default(100),
|
||||||
process_memory_threshold_mb: z.number().min(64).default(1500),
|
process_memory_threshold_mb: z.number().min(64).default(1500),
|
||||||
backup_failure_threshold: z.number().min(1).max(10).default(1),
|
backup_failure_threshold: z.number().min(1).max(10).default(1),
|
||||||
|
provider_error_rate_threshold: z.number().min(0).max(1).default(0.5),
|
||||||
|
provider_error_min_calls: z.number().min(1).default(5),
|
||||||
}).default({});
|
}).default({});
|
||||||
|
|
||||||
const gcalSchema = z.object({
|
const gcalSchema = z.object({
|
||||||
|
|||||||
@@ -479,6 +479,7 @@ export async function startServices(deps: {
|
|||||||
memoryDir: config.memory.enabled ? memoryDir : undefined,
|
memoryDir: config.memory.enabled ? memoryDir : undefined,
|
||||||
dataDir,
|
dataDir,
|
||||||
channelLookup: channelRegistry,
|
channelLookup: channelRegistry,
|
||||||
|
getModelCalls: () => gateway.getMetrics().getModelMetrics(),
|
||||||
});
|
});
|
||||||
|
|
||||||
heartbeatMonitor.start();
|
heartbeatMonitor.start();
|
||||||
|
|||||||
Reference in New Issue
Block a user