feat(heartbeat): add process memory and backup health checks

This commit is contained in:
William Valentin
2026-02-16 13:50:39 -08:00
parent 8684c3a07d
commit 07340ff0af
11 changed files with 282 additions and 8 deletions
+76
View File
@@ -2,6 +2,7 @@ import { statfsSync, accessSync, constants as fsConstants } from 'fs';
import { request } from 'http';
import type { HeartbeatConfig, HeartbeatCheck } from '../config/schema.js';
import type { ChannelAdapter, OutboundMessage } from '../channels/types.js';
import { getBackupHealthSnapshot, type BackupHealthSnapshot } from '../backup/index.js';
import { auditLogger } from '../audit/index.js';
/** Result of a single health check. */
@@ -38,6 +39,8 @@ export interface HeartbeatDeps {
memoryDir: string | undefined;
dataDir: string;
channelLookup: ChannelLookup;
processMemoryUsageMb?: () => number;
backupHealthProvider?: () => BackupHealthSnapshot;
}
/**
@@ -126,6 +129,12 @@ export class HeartbeatMonitor {
case 'disk':
result = this.checkDisk(start);
break;
case 'process_memory':
result = this.checkProcessMemory(start);
break;
case 'backup':
result = this.checkBackup(start);
break;
default:
result = { name: check, healthy: false, message: `Unknown check: ${check}`, durationMs: Date.now() - start };
}
@@ -312,6 +321,73 @@ export class HeartbeatMonitor {
}
}
/**
 * Checks the process's memory footprint against the configured limit.
 *
 * The reading comes from the injected `processMemoryUsageMb` dependency when
 * one is present; otherwise it is the current RSS reported by
 * `process.memoryUsage()`, converted to whole megabytes. The check passes
 * while the reading is at or below `config.process_memory_threshold_mb`.
 *
 * @param start - `Date.now()`-style timestamp taken when the check began; used to compute `durationMs`.
 * @returns The check result. Exceptions raised while sampling are converted
 *   into an unhealthy result instead of propagating.
 */
private checkProcessMemory(start: number): CheckResult {
  try {
    let rssMb: number;
    if (this.deps.processMemoryUsageMb) {
      rssMb = this.deps.processMemoryUsageMb();
    } else {
      const bytesPerMb = 1024 * 1024;
      rssMb = Math.round(process.memoryUsage().rss / bytesPerMb);
    }

    const limitMb = this.deps.config.process_memory_threshold_mb;

    if (rssMb <= limitMb) {
      return {
        name: 'process_memory',
        healthy: true,
        message: `Process RSS ${rssMb} MB (threshold: ${limitMb} MB)`,
        durationMs: Date.now() - start,
      };
    }

    // Anything that is not "at or below the limit" is reported as unhealthy.
    return {
      name: 'process_memory',
      healthy: false,
      message: `High memory usage: ${rssMb} MB RSS (threshold: ${limitMb} MB)`,
      durationMs: Date.now() - start,
    };
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'Failed to check process memory usage';
    return {
      name: 'process_memory',
      healthy: false,
      message: reason,
      durationMs: Date.now() - start,
    };
  }
}
/**
 * Evaluates backup health from the backup subsystem's snapshot.
 *
 * The snapshot comes from the injected `backupHealthProvider` when one is
 * supplied, otherwise from `getBackupHealthSnapshot()`. The check passes when
 * the scheduler is disabled, when no run has been recorded yet, or while
 * `consecutiveFailures` is below `config.backup_failure_threshold`.
 *
 * @param start - `Date.now()`-style timestamp taken when the check began; used to compute `durationMs`.
 * @returns The check result. Any exception thrown while reading or formatting
 *   the snapshot is reported as an unhealthy result rather than rethrown.
 */
private checkBackup(start: number): CheckResult {
  try {
    const snapshot = this.deps.backupHealthProvider
      ? this.deps.backupHealthProvider()
      : getBackupHealthSnapshot();

    if (!snapshot.enabled) {
      return { name: 'backup', healthy: true, message: 'Backup scheduler disabled', durationMs: Date.now() - start };
    }

    if (!snapshot.hasRun) {
      return { name: 'backup', healthy: true, message: 'No backup runs recorded yet', durationMs: Date.now() - start };
    }

    // Only compare against the threshold once we know there is run history to judge.
    const threshold = this.deps.config.backup_failure_threshold;

    if (snapshot.consecutiveFailures < threshold) {
      const successSuffix = snapshot.lastSuccessAt
        ? `, last success ${new Date(snapshot.lastSuccessAt).toISOString()}`
        : '';
      return {
        name: 'backup',
        healthy: true,
        message: `Consecutive failures: ${snapshot.consecutiveFailures}/${threshold}${successSuffix}`,
        durationMs: Date.now() - start,
      };
    }

    // Separate the summary from the underlying error text so the alert reads
    // "...threshold: N): <error>" instead of running the two together.
    const errorSuffix = snapshot.lastError ? `: ${snapshot.lastError}` : '';
    return {
      name: 'backup',
      healthy: false,
      message: `Backup failing (${snapshot.consecutiveFailures} consecutive failures, threshold: ${threshold})${errorSuffix}`,
      durationMs: Date.now() - start,
    };
  } catch (err) {
    return {
      name: 'backup',
      healthy: false,
      message: err instanceof Error ? err.message : 'Failed to read backup health state',
      durationMs: Date.now() - start,
    };
  }
}
// ── Notification ───────────────────────────────────────────────
private async notify(text: string): Promise<void> {