feat(heartbeat): add process memory and backup health checks

This commit is contained in:
William Valentin
2026-02-16 13:50:39 -08:00
parent 8684c3a07d
commit 07340ff0af
11 changed files with 282 additions and 8 deletions
+78 -1
View File
@@ -8,9 +8,11 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
return {
enabled: true,
interval: '5m',
checks: ['gateway', 'model', 'channels', 'memory', 'disk'],
checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'],
failure_threshold: 2,
disk_threshold_mb: 100,
process_memory_threshold_mb: 1500,
backup_failure_threshold: 1,
...overrides,
};
}
@@ -29,6 +31,12 @@ function makeDeps(overrides?: Partial<HeartbeatDeps>): HeartbeatDeps {
memoryDir: '/tmp/flynn-test-memory',
dataDir: '/tmp',
channelLookup: { get: vi.fn() },
processMemoryUsageMb: () => 256,
backupHealthProvider: () => ({
enabled: false,
hasRun: false,
consecutiveFailures: 0,
}),
...overrides,
};
}
@@ -436,4 +444,73 @@ describe('HeartbeatMonitor', () => {
expect(check.healthy).toBe(false);
});
});
describe('process_memory check', () => {
it('passes when RSS is below threshold', async () => {
const deps = makeDeps({
config: makeConfig({ checks: ['process_memory'], process_memory_threshold_mb: 512 }),
processMemoryUsageMb: () => 200,
});
monitor = new HeartbeatMonitor(deps);
const result = await monitor.runChecks();
const check = result.checks.find((c) => c.name === 'process_memory');
if (!check) {throw new Error('Expected process_memory check result');}
expect(check.healthy).toBe(true);
});
it('fails when RSS is above threshold', async () => {
const deps = makeDeps({
config: makeConfig({ checks: ['process_memory'], process_memory_threshold_mb: 128 }),
processMemoryUsageMb: () => 512,
});
monitor = new HeartbeatMonitor(deps);
const result = await monitor.runChecks();
const check = result.checks.find((c) => c.name === 'process_memory');
if (!check) {throw new Error('Expected process_memory check result');}
expect(check.healthy).toBe(false);
expect(check.message).toContain('High memory usage');
});
});
describe('backup check', () => {
it('passes when backup is disabled', async () => {
const deps = makeDeps({
config: makeConfig({ checks: ['backup'] }),
backupHealthProvider: () => ({
enabled: false,
hasRun: false,
consecutiveFailures: 0,
}),
});
monitor = new HeartbeatMonitor(deps);
const result = await monitor.runChecks();
const check = result.checks.find((c) => c.name === 'backup');
if (!check) {throw new Error('Expected backup check result');}
expect(check.healthy).toBe(true);
expect(check.message).toContain('disabled');
});
it('fails when backup consecutive failures exceed threshold', async () => {
const deps = makeDeps({
config: makeConfig({ checks: ['backup'], backup_failure_threshold: 2 }),
backupHealthProvider: () => ({
enabled: true,
hasRun: true,
consecutiveFailures: 3,
lastError: 'minio unavailable',
}),
});
monitor = new HeartbeatMonitor(deps);
const result = await monitor.runChecks();
const check = result.checks.find((c) => c.name === 'backup');
if (!check) {throw new Error('Expected backup check result');}
expect(check.healthy).toBe(false);
expect(check.message).toContain('Backup failing');
expect(check.message).toContain('minio unavailable');
});
});
});
+76
View File
@@ -2,6 +2,7 @@ import { statfsSync, accessSync, constants as fsConstants } from 'fs';
import { request } from 'http';
import type { HeartbeatConfig, HeartbeatCheck } from '../config/schema.js';
import type { ChannelAdapter, OutboundMessage } from '../channels/types.js';
import { getBackupHealthSnapshot, type BackupHealthSnapshot } from '../backup/index.js';
import { auditLogger } from '../audit/index.js';
/** Result of a single health check. */
@@ -38,6 +39,8 @@ export interface HeartbeatDeps {
memoryDir: string | undefined;
dataDir: string;
channelLookup: ChannelLookup;
processMemoryUsageMb?: () => number;
backupHealthProvider?: () => BackupHealthSnapshot;
}
/**
@@ -126,6 +129,12 @@ export class HeartbeatMonitor {
case 'disk':
result = this.checkDisk(start);
break;
case 'process_memory':
result = this.checkProcessMemory(start);
break;
case 'backup':
result = this.checkBackup(start);
break;
default:
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
}
@@ -312,6 +321,73 @@ export class HeartbeatMonitor {
}
}
private checkProcessMemory(start: number): CheckResult {
try {
const usageMb = this.deps.processMemoryUsageMb
? this.deps.processMemoryUsageMb()
: Math.round(process.memoryUsage().rss / (1024 * 1024));
const thresholdMb = this.deps.config.process_memory_threshold_mb;
const healthy = usageMb <= thresholdMb;
return {
name: 'process_memory',
healthy,
message: healthy
? `Process RSS ${usageMb} MB (threshold: ${thresholdMb} MB)`
: `High memory usage: ${usageMb} MB RSS (threshold: ${thresholdMb} MB)`,
durationMs: Date.now() - start,
};
} catch (err) {
return {
name: 'process_memory',
healthy: false,
message: err instanceof Error ? err.message : 'Failed to check process memory usage',
durationMs: Date.now() - start,
};
}
}
private checkBackup(start: number): CheckResult {
try {
const snapshot = this.deps.backupHealthProvider
? this.deps.backupHealthProvider()
: getBackupHealthSnapshot();
if (!snapshot.enabled) {
return { name: 'backup', healthy: true, message: 'Backup scheduler disabled', durationMs: Date.now() - start };
}
const threshold = this.deps.config.backup_failure_threshold;
const healthy = snapshot.consecutiveFailures < threshold;
if (!snapshot.hasRun) {
return { name: 'backup', healthy: true, message: 'No backup runs recorded yet', durationMs: Date.now() - start };
}
if (healthy) {
const successSuffix = snapshot.lastSuccessAt ? `, last success ${new Date(snapshot.lastSuccessAt).toISOString()}` : '';
return {
name: 'backup',
healthy: true,
message: `Consecutive failures: ${snapshot.consecutiveFailures}/${threshold}${successSuffix}`,
durationMs: Date.now() - start,
};
}
return {
name: 'backup',
healthy: false,
message: `Backup failing (${snapshot.consecutiveFailures} consecutive failures, threshold: ${threshold})${snapshot.lastError ? `${snapshot.lastError}` : ''}`,
durationMs: Date.now() - start,
};
} catch (err) {
return {
name: 'backup',
healthy: false,
message: err instanceof Error ? err.message : 'Failed to read backup health state',
durationMs: Date.now() - start,
};
}
}
// ── Notification ───────────────────────────────────────────────
private async notify(text: string): Promise<void> {
+7
View File
@@ -1,2 +1,9 @@
export { runBackupSnapshot, backupInternals, type BackupRunOptions, type BackupResult } from './run.js';
export { BackupScheduler, type BackupSchedulerDeps } from './scheduler.js';
export {
getBackupHealthSnapshot,
initBackupHealth,
markBackupSuccess,
markBackupFailure,
type BackupHealthSnapshot,
} from './status.js';
+4
View File
@@ -3,6 +3,7 @@ import type { BackupConfig } from '../config/schema.js';
import type { OutboundMessage } from '../channels/types.js';
import { parseDuration } from '../session/index.js';
import { runBackupSnapshot } from './run.js';
import { initBackupHealth, markBackupFailure, markBackupSuccess } from './status.js';
interface ChannelLookup {
get(name: string): { send(peerId: string, message: OutboundMessage): Promise<void> } | undefined;
@@ -28,6 +29,7 @@ export class BackupScheduler {
}
start(): void {
initBackupHealth(this.deps.backupConfig.enabled);
if (!this.deps.backupConfig.enabled) {
return;
}
@@ -92,10 +94,12 @@ export class BackupScheduler {
backupConfig: this.deps.backupConfig,
});
console.log(`Backup completed: ${result.archivePath}${result.uploaded && result.remotePath ? ` -> ${result.remotePath}` : ''}`);
markBackupSuccess();
await this.handleSuccess();
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.error(`Backup failed: ${message}`);
markBackupFailure(message);
await this.handleFailure(message);
} finally {
this.running = false;
+42
View File
@@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import {
getBackupHealthSnapshot,
initBackupHealth,
markBackupFailure,
markBackupSuccess,
} from './status.js';
describe('backup status tracker', () => {
it('tracks failures and resets on success', () => {
initBackupHealth(true);
expect(getBackupHealthSnapshot()).toMatchObject({
enabled: true,
hasRun: false,
consecutiveFailures: 0,
});
markBackupFailure('upload failed', 123);
expect(getBackupHealthSnapshot()).toMatchObject({
enabled: true,
hasRun: true,
consecutiveFailures: 1,
lastFailureAt: 123,
lastError: 'upload failed',
});
markBackupFailure('upload failed again', 456);
expect(getBackupHealthSnapshot()).toMatchObject({
consecutiveFailures: 2,
lastFailureAt: 456,
lastError: 'upload failed again',
});
markBackupSuccess(789);
expect(getBackupHealthSnapshot()).toMatchObject({
hasRun: true,
consecutiveFailures: 0,
lastSuccessAt: 789,
lastError: undefined,
});
});
});
+46
View File
@@ -0,0 +1,46 @@
export interface BackupHealthSnapshot {
enabled: boolean;
hasRun: boolean;
consecutiveFailures: number;
lastSuccessAt?: number;
lastFailureAt?: number;
lastError?: string;
}
let snapshot: BackupHealthSnapshot = {
enabled: false,
hasRun: false,
consecutiveFailures: 0,
};
export function initBackupHealth(enabled: boolean): void {
snapshot = {
enabled,
hasRun: false,
consecutiveFailures: 0,
};
}
export function markBackupSuccess(now = Date.now()): void {
snapshot = {
...snapshot,
hasRun: true,
consecutiveFailures: 0,
lastSuccessAt: now,
lastError: undefined,
};
}
export function markBackupFailure(error: string, now = Date.now()): void {
snapshot = {
...snapshot,
hasRun: true,
consecutiveFailures: snapshot.consecutiveFailures + 1,
lastFailureAt: now,
lastError: error,
};
}
export function getBackupHealthSnapshot(): BackupHealthSnapshot {
return { ...snapshot };
}
+8
View File
@@ -962,6 +962,14 @@ describe('configSchema automation', () => {
expect(result.automation.daily_briefing.prompt).toBe('Custom briefing prompt');
expect(result.automation.daily_briefing.model_tier).toBe('fast');
});
it('defaults heartbeat extended thresholds and checks', () => {
const result = configSchema.parse(baseConfig);
expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500);
expect(result.automation.heartbeat.backup_failure_threshold).toBe(1);
expect(result.automation.heartbeat.checks).toContain('process_memory');
expect(result.automation.heartbeat.checks).toContain('backup');
});
});
describe('configSchema — intents', () => {
+4 -2
View File
@@ -302,18 +302,20 @@ const gmailSchema = z.object({
message: z.string().default('New email from {{from}}: {{subject}}\n\n{{snippet}}'),
}).optional();
const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk']);
const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']);
const heartbeatSchema = z.object({
enabled: z.boolean().default(false),
interval: z.string().default('5m'),
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk']),
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup']),
notify: z.object({
channel: z.string().min(1),
peer: z.string().min(1),
}).optional(),
failure_threshold: z.number().min(1).max(10).default(2),
disk_threshold_mb: z.number().min(10).default(100),
process_memory_threshold_mb: z.number().min(64).default(1500),
backup_failure_threshold: z.number().min(1).max(10).default(1),
}).default({});
const gcalSchema = z.object({