feat(ops): add setup operator pack, heartbeat alert cooldown, and doctor strict mode

This commit is contained in:
William Valentin
2026-02-16 14:57:56 -08:00
parent 030fb13a26
commit 3210e75c94
12 changed files with 274 additions and 17 deletions
+29
View File
@@ -8,6 +8,7 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
return {
enabled: true,
interval: '5m',
notify_cooldown: '30m',
checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'],
failure_threshold: 2,
disk_threshold_mb: 100,
@@ -227,6 +228,34 @@ describe('HeartbeatMonitor', () => {
expect(mockSend).toHaveBeenCalledTimes(1);
});
// Scenario: fail -> recover -> fail again with the same failing check, all well inside notify_cooldown.
// The second failure alert must not be sent, so the send spy stops at two calls.
it('suppresses repeat failure notifications inside notify cooldown after recovery', async () => {
// Stub channel: channelLookup.get() returns an object whose send() is the spy counted below.
const mockSend = vi.fn().mockResolvedValue(undefined);
const mockGet = vi.fn().mockReturnValue({ send: mockSend });
const deps = makeDeps({
config: makeConfig({
// Only the model check is enabled, so health depends solely on it.
checks: ['model'],
// Threshold of 1: a single unhealthy run is enough to reach the alerting branch.
failure_threshold: 1,
// Cooldown much longer than the test's runtime, so the third run falls inside it.
notify_cooldown: '1h',
notify: { channel: 'telegram', peer: '123' },
}),
// No model router for the first run; the test expects this to make the run unhealthy.
modelRouter: undefined,
channelLookup: { get: mockGet },
});
monitor = new HeartbeatMonitor(deps);
// Run 1: expected to send the failure alert (first notification).
await monitor.runChecks();
expect(mockSend).toHaveBeenCalledTimes(1);
// Run 2: patch a router onto the same deps object so the check passes; expected to send the recovery notice.
// NOTE(review): relies on the monitor keeping a reference to this deps object rather than a copy — confirm in the constructor.
Object.assign(deps, { modelRouter: { getTier: () => 'default' } });
await monitor.runChecks();
expect(mockSend).toHaveBeenCalledTimes(2);
// Run 3: remove the router again so the same failure recurs inside the cooldown window.
Object.assign(deps, { modelRouter: undefined });
await monitor.runChecks();
// Count stays at 2: the repeat failure alert was suppressed.
expect(mockSend).toHaveBeenCalledTimes(2);
});
it('recovery notification sent when checks pass after failures', async () => {
const mockSend = vi.fn().mockResolvedValue(undefined);
const mockGet = vi.fn().mockReturnValue({ send: mockSend });
+56 -12
View File
@@ -69,7 +69,11 @@ export class HeartbeatMonitor {
private timer: ReturnType<typeof setInterval> | undefined;
private lastResult: HeartbeatResult | undefined;
private consecutiveFailures = 0;
private notifiedFailure = false;
private failureAlertSentForCurrentIncident = false;
private failureAlertProcessedForCurrentIncident = false;
private lastFailureNotificationAt = 0;
private lastFailureSignature = '';
private lastRecoveryNotificationAt = 0;
private readonly deps: HeartbeatDeps;
constructor(deps: HeartbeatDeps) {
@@ -172,28 +176,38 @@ export class HeartbeatMonitor {
// Failure tracking and notification
if (!healthy) {
this.consecutiveFailures++;
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.notifiedFailure) {
this.notifiedFailure = true;
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.failureAlertProcessedForCurrentIncident) {
this.failureAlertProcessedForCurrentIncident = true;
const failedChecks = checks.filter((c) => !c.healthy).map((c) => `${c.name}: ${c.message}`);
await this.notify(`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`);
const signature = failedChecks.join('|');
const sent = await this.notifyFailureWithCooldown(
`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`,
signature,
);
this.failureAlertSentForCurrentIncident = sent;
auditLogger?.heartbeatFail({
checks_failed: failedChecks,
consecutive_failures: this.consecutiveFailures,
threshold: this.deps.config.failure_threshold,
});
if (sent) {
auditLogger?.heartbeatFail({
checks_failed: failedChecks,
consecutive_failures: this.consecutiveFailures,
threshold: this.deps.config.failure_threshold,
});
}
}
} else {
if (this.notifiedFailure) {
if (this.failureAlertSentForCurrentIncident) {
// Recovery notification
await this.notify(`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`);
await this.notifyRecoveryWithCooldown(
`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`,
);
auditLogger?.heartbeatRecover({
consecutive_failures_before: this.consecutiveFailures,
});
}
this.consecutiveFailures = 0;
this.notifiedFailure = false;
this.failureAlertSentForCurrentIncident = false;
this.failureAlertProcessedForCurrentIncident = false;
}
auditLogger?.heartbeatCycle({
@@ -466,4 +480,34 @@ export class HeartbeatMonitor {
console.error('HeartbeatMonitor: failed to send notification:', err);
}
}
/**
 * Reports whether the cooldown window measured from `lastAt` has elapsed.
 *
 * @param lastAt - Timestamp in epoch milliseconds (same clock as `Date.now()`) to measure from.
 * @param cooldownMs - Minimum gap, in milliseconds, that must have passed.
 * @returns `true` when the time elapsed since `lastAt` is greater than or equal to `cooldownMs`.
 */
private shouldNotifyByCooldown(lastAt: number, cooldownMs: number): boolean {
  const elapsedMs = Date.now() - lastAt;
  // Keep the comparison as `>=` (not a negated `<`) so a NaN cooldown still yields false.
  return elapsedMs >= cooldownMs;
}
/**
 * Sends a failure notification unless an identical one is still inside the cooldown window.
 *
 * A notification goes out when the failure signature differs from the last one recorded,
 * or when the configured `notify_cooldown` (default `'30m'`) has elapsed since the last
 * failure notification. On send, the timestamp and signature are recorded for the next call.
 *
 * @param text - Message body passed to `notify`.
 * @param signature - String identifying the current set of failures; compared by strict equality.
 * @returns `true` if `notify` was invoked, `false` if the alert was suppressed.
 *   NOTE(review): `true` means the send was attempted; whether `notify` reports delivery
 *   failures is not visible from here — confirm.
 */
private async notifyFailureWithCooldown(text: string, signature: string): Promise<boolean> {
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  const isNewSignature = this.lastFailureSignature !== signature;
  const windowElapsed = this.shouldNotifyByCooldown(this.lastFailureNotificationAt, windowMs);

  if (isNewSignature || windowElapsed) {
    await this.notify(text);
    // Record state only after notify() settles, so the window starts from the send.
    this.lastFailureNotificationAt = Date.now();
    this.lastFailureSignature = signature;
    return true;
  }

  // Same failure, still cooling down: stay quiet.
  return false;
}
/**
 * Sends a recovery notification unless one was already sent within the cooldown window.
 *
 * The window length comes from `notify_cooldown` in the config (default `'30m'`) and is
 * measured from the last recovery notification, tracked separately from failure alerts.
 *
 * @param text - Message body passed to `notify`.
 * @returns `true` if `notify` was invoked, `false` if the notification was suppressed.
 */
private async notifyRecoveryWithCooldown(text: string): Promise<boolean> {
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  const windowElapsed = this.shouldNotifyByCooldown(this.lastRecoveryNotificationAt, windowMs);

  if (windowElapsed) {
    await this.notify(text);
    // Start the next window from the moment the send completed.
    this.lastRecoveryNotificationAt = Date.now();
    return true;
  }

  return false;
}
}