feat(heartbeat): add process memory and backup health checks
This commit is contained in:
@@ -8,9 +8,11 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
|
||||
return {
|
||||
enabled: true,
|
||||
interval: '5m',
|
||||
checks: ['gateway', 'model', 'channels', 'memory', 'disk'],
|
||||
checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'],
|
||||
failure_threshold: 2,
|
||||
disk_threshold_mb: 100,
|
||||
process_memory_threshold_mb: 1500,
|
||||
backup_failure_threshold: 1,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
@@ -29,6 +31,12 @@ function makeDeps(overrides?: Partial<HeartbeatDeps>): HeartbeatDeps {
|
||||
memoryDir: '/tmp/flynn-test-memory',
|
||||
dataDir: '/tmp',
|
||||
channelLookup: { get: vi.fn() },
|
||||
processMemoryUsageMb: () => 256,
|
||||
backupHealthProvider: () => ({
|
||||
enabled: false,
|
||||
hasRun: false,
|
||||
consecutiveFailures: 0,
|
||||
}),
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
@@ -436,4 +444,73 @@ describe('HeartbeatMonitor', () => {
|
||||
expect(check.healthy).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Covers the `process_memory` heartbeat check on both sides of the configured threshold.
describe('process_memory check', () => {
  it('passes when RSS is below threshold', async () => {
    // Reported usage (200 MB) sits under the 512 MB limit.
    const config = makeConfig({ checks: ['process_memory'], process_memory_threshold_mb: 512 });
    monitor = new HeartbeatMonitor(makeDeps({ config, processMemoryUsageMb: () => 200 }));

    const { checks } = await monitor.runChecks();
    const memoryCheck = checks.find((entry) => entry.name === 'process_memory');
    if (!memoryCheck) {
      throw new Error('Expected process_memory check result');
    }

    expect(memoryCheck.healthy).toBe(true);
  });

  it('fails when RSS is above threshold', async () => {
    // Reported usage (512 MB) is over the 128 MB limit.
    const config = makeConfig({ checks: ['process_memory'], process_memory_threshold_mb: 128 });
    monitor = new HeartbeatMonitor(makeDeps({ config, processMemoryUsageMb: () => 512 }));

    const { checks } = await monitor.runChecks();
    const memoryCheck = checks.find((entry) => entry.name === 'process_memory');
    if (!memoryCheck) {
      throw new Error('Expected process_memory check result');
    }

    expect(memoryCheck.healthy).toBe(false);
    expect(memoryCheck.message).toContain('High memory usage');
  });
});
|
||||
|
||||
// Covers the `backup` heartbeat check: disabled scheduler and repeated failures.
describe('backup check', () => {
  it('passes when backup is disabled', async () => {
    const disabledSnapshot = {
      enabled: false,
      hasRun: false,
      consecutiveFailures: 0,
    };
    const config = makeConfig({ checks: ['backup'] });
    monitor = new HeartbeatMonitor(makeDeps({ config, backupHealthProvider: () => disabledSnapshot }));

    const { checks } = await monitor.runChecks();
    const backupCheck = checks.find((entry) => entry.name === 'backup');
    if (!backupCheck) {
      throw new Error('Expected backup check result');
    }

    expect(backupCheck.healthy).toBe(true);
    expect(backupCheck.message).toContain('disabled');
  });

  it('fails when backup consecutive failures exceed threshold', async () => {
    // Three consecutive failures against a threshold of two.
    const failingSnapshot = {
      enabled: true,
      hasRun: true,
      consecutiveFailures: 3,
      lastError: 'minio unavailable',
    };
    const config = makeConfig({ checks: ['backup'], backup_failure_threshold: 2 });
    monitor = new HeartbeatMonitor(makeDeps({ config, backupHealthProvider: () => failingSnapshot }));

    const { checks } = await monitor.runChecks();
    const backupCheck = checks.find((entry) => entry.name === 'backup');
    if (!backupCheck) {
      throw new Error('Expected backup check result');
    }

    expect(backupCheck.healthy).toBe(false);
    expect(backupCheck.message).toContain('Backup failing');
    expect(backupCheck.message).toContain('minio unavailable');
  });
});
|
||||
});
|
||||
|
||||
@@ -2,6 +2,7 @@ import { statfsSync, accessSync, constants as fsConstants } from 'fs';
|
||||
import { request } from 'http';
|
||||
import type { HeartbeatConfig, HeartbeatCheck } from '../config/schema.js';
|
||||
import type { ChannelAdapter, OutboundMessage } from '../channels/types.js';
|
||||
import { getBackupHealthSnapshot, type BackupHealthSnapshot } from '../backup/index.js';
|
||||
import { auditLogger } from '../audit/index.js';
|
||||
|
||||
/** Result of a single health check. */
|
||||
@@ -38,6 +39,8 @@ export interface HeartbeatDeps {
|
||||
memoryDir: string | undefined;
|
||||
dataDir: string;
|
||||
channelLookup: ChannelLookup;
|
||||
processMemoryUsageMb?: () => number;
|
||||
backupHealthProvider?: () => BackupHealthSnapshot;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -126,6 +129,12 @@ export class HeartbeatMonitor {
|
||||
case 'disk':
|
||||
result = this.checkDisk(start);
|
||||
break;
|
||||
case 'process_memory':
|
||||
result = this.checkProcessMemory(start);
|
||||
break;
|
||||
case 'backup':
|
||||
result = this.checkBackup(start);
|
||||
break;
|
||||
default:
|
||||
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
|
||||
}
|
||||
@@ -312,6 +321,73 @@ export class HeartbeatMonitor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Compares the process's memory usage (in MB) against
 * `config.process_memory_threshold_mb`.
 *
 * Usage comes from the injected `deps.processMemoryUsageMb` when present,
 * otherwise from `process.memoryUsage().rss` rounded to whole megabytes.
 * Any error thrown while measuring is reported as an unhealthy result
 * rather than propagated.
 *
 * @param start - Timestamp (ms) the check started at; used for `durationMs`.
 * @returns The `process_memory` check result.
 */
private checkProcessMemory(start: number): CheckResult {
  // Builds the result; the duration is sampled at the moment of return.
  const report = (healthy: boolean, message: string): CheckResult => ({
    name: 'process_memory',
    healthy,
    message,
    durationMs: Date.now() - start,
  });

  try {
    const deps = this.deps;
    const bytesPerMb = 1024 * 1024;

    let rssMb: number;
    if (deps.processMemoryUsageMb) {
      rssMb = deps.processMemoryUsageMb();
    } else {
      rssMb = Math.round(process.memoryUsage().rss / bytesPerMb);
    }

    const limitMb = deps.config.process_memory_threshold_mb;

    // Usage exactly at the threshold still counts as healthy.
    if (rssMb <= limitMb) {
      return report(true, `Process RSS ${rssMb} MB (threshold: ${limitMb} MB)`);
    }
    return report(false, `High memory usage: ${rssMb} MB RSS (threshold: ${limitMb} MB)`);
  } catch (err) {
    const reason = err instanceof Error ? err.message : 'Failed to check process memory usage';
    return report(false, reason);
  }
}
|
||||
|
||||
/**
 * Reports backup health from a backup health snapshot.
 *
 * The snapshot comes from the injected `deps.backupHealthProvider` when
 * present, otherwise from `getBackupHealthSnapshot()`. A disabled scheduler
 * and a scheduler that has not run yet are both reported as healthy. Once
 * runs exist, the check is healthy only while `consecutiveFailures` is
 * strictly below `config.backup_failure_threshold`. Any error thrown while
 * reading the snapshot is reported as an unhealthy result.
 *
 * @param start - Timestamp (ms) the check started at; used for `durationMs`.
 * @returns The `backup` check result.
 */
private checkBackup(start: number): CheckResult {
  // Builds the result; the duration is sampled at the moment of return.
  const report = (healthy: boolean, message: string): CheckResult => ({
    name: 'backup',
    healthy,
    message,
    durationMs: Date.now() - start,
  });

  try {
    const deps = this.deps;

    let health: BackupHealthSnapshot;
    if (deps.backupHealthProvider) {
      health = deps.backupHealthProvider();
    } else {
      health = getBackupHealthSnapshot();
    }

    if (!health.enabled) {
      return report(true, 'Backup scheduler disabled');
    }

    const limit = deps.config.backup_failure_threshold;

    if (!health.hasRun) {
      return report(true, 'No backup runs recorded yet');
    }

    const failures = health.consecutiveFailures;

    // Reaching the threshold (not just exceeding it) marks the backup unhealthy.
    if (!(failures < limit)) {
      const errorDetail = health.lastError ? ` — ${health.lastError}` : '';
      return report(
        false,
        `Backup failing (${failures} consecutive failures, threshold: ${limit})${errorDetail}`,
      );
    }

    const lastSuccessNote = health.lastSuccessAt
      ? `, last success ${new Date(health.lastSuccessAt).toISOString()}`
      : '';
    return report(true, `Consecutive failures: ${failures}/${limit}${lastSuccessNote}`);
  } catch (err) {
    const reason = err instanceof Error ? err.message : 'Failed to read backup health state';
    return report(false, reason);
  }
}
|
||||
|
||||
// ── Notification ───────────────────────────────────────────────
|
||||
|
||||
private async notify(text: string): Promise<void> {
|
||||
|
||||
Reference in New Issue
Block a user