feat(ops): add setup operator pack, heartbeat alert cooldown, and doctor strict mode

This commit is contained in:
William Valentin
2026-02-16 14:57:56 -08:00
parent 030fb13a26
commit 3210e75c94
12 changed files with 274 additions and 17 deletions
+56 -12
View File
@@ -69,7 +69,11 @@ export class HeartbeatMonitor {
private timer: ReturnType<typeof setInterval> | undefined;
private lastResult: HeartbeatResult | undefined;
private consecutiveFailures = 0;
private notifiedFailure = false;
private failureAlertSentForCurrentIncident = false;
private failureAlertProcessedForCurrentIncident = false;
private lastFailureNotificationAt = 0;
private lastFailureSignature = '';
private lastRecoveryNotificationAt = 0;
private readonly deps: HeartbeatDeps;
constructor(deps: HeartbeatDeps) {
@@ -172,28 +176,38 @@ export class HeartbeatMonitor {
// Failure tracking and notification
if (!healthy) {
this.consecutiveFailures++;
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.notifiedFailure) {
this.notifiedFailure = true;
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.failureAlertProcessedForCurrentIncident) {
this.failureAlertProcessedForCurrentIncident = true;
const failedChecks = checks.filter((c) => !c.healthy).map((c) => `${c.name}: ${c.message}`);
await this.notify(`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`);
const signature = failedChecks.join('|');
const sent = await this.notifyFailureWithCooldown(
`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`,
signature,
);
this.failureAlertSentForCurrentIncident = sent;
auditLogger?.heartbeatFail({
checks_failed: failedChecks,
consecutive_failures: this.consecutiveFailures,
threshold: this.deps.config.failure_threshold,
});
if (sent) {
auditLogger?.heartbeatFail({
checks_failed: failedChecks,
consecutive_failures: this.consecutiveFailures,
threshold: this.deps.config.failure_threshold,
});
}
}
} else {
if (this.notifiedFailure) {
if (this.failureAlertSentForCurrentIncident) {
// Recovery notification
await this.notify(`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`);
await this.notifyRecoveryWithCooldown(
`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`,
);
auditLogger?.heartbeatRecover({
consecutive_failures_before: this.consecutiveFailures,
});
}
this.consecutiveFailures = 0;
this.notifiedFailure = false;
this.failureAlertSentForCurrentIncident = false;
this.failureAlertProcessedForCurrentIncident = false;
}
auditLogger?.heartbeatCycle({
@@ -466,4 +480,34 @@ export class HeartbeatMonitor {
console.error('HeartbeatMonitor: failed to send notification:', err);
}
}
/**
 * Reports whether the cooldown window that started at `lastAt` has elapsed.
 *
 * @param lastAt - Timestamp of the previous notification, on the same
 *   millisecond scale as `Date.now()`.
 * @param cooldownMs - Minimum gap, in milliseconds, required between notifications.
 * @returns `true` when the time elapsed since `lastAt` is greater than or
 *   equal to `cooldownMs`; `false` otherwise.
 */
private shouldNotifyByCooldown(lastAt: number, cooldownMs: number): boolean {
  const elapsedMs = Date.now() - lastAt;
  return elapsedMs >= cooldownMs;
}
/**
 * Sends a failure notification unless it repeats the previous failure
 * signature while the cooldown window is still open.
 *
 * The cooldown comes from `deps.config.notify_cooldown`, falling back to
 * `'30m'` when that value is null or undefined.
 * NOTE(review): `parseInterval` is presumably expected to yield milliseconds,
 * since its result is compared against a `Date.now()` difference — confirm.
 *
 * @param text - Message body handed to `this.notify`.
 * @param signature - Identifier for the current set of failing checks; a
 *   signature different from the last notified one bypasses the cooldown.
 * @returns `true` when `this.notify` was invoked (the last-notification
 *   timestamp and signature are then updated); `false` when suppressed.
 */
private async notifyFailureWithCooldown(text: string, signature: string): Promise<boolean> {
  const intervalSpec = this.deps.config.notify_cooldown ?? '30m';
  const cooldownMs = parseInterval(intervalSpec);

  const sameSignature = signature === this.lastFailureSignature;
  const withinCooldown = !this.shouldNotifyByCooldown(this.lastFailureNotificationAt, cooldownMs);

  // Suppress only an identical failure that is still inside the cooldown window.
  if (sameSignature && withinCooldown) {
    return false;
  }

  await this.notify(text);

  // Record what was just reported so later calls can be compared against it.
  this.lastFailureNotificationAt = Date.now();
  this.lastFailureSignature = signature;
  return true;
}
/**
 * Sends a recovery notification unless one was already sent within the
 * cooldown window.
 *
 * The cooldown comes from `deps.config.notify_cooldown`, falling back to
 * `'30m'` when that value is null or undefined.
 * NOTE(review): `parseInterval` is presumably expected to yield milliseconds,
 * since its result is compared against a `Date.now()` difference — confirm.
 *
 * @param text - Message body handed to `this.notify`.
 * @returns `true` when `this.notify` was invoked (the last-recovery timestamp
 *   is then updated); `false` when the cooldown suppressed the message.
 */
private async notifyRecoveryWithCooldown(text: string): Promise<boolean> {
  const intervalSpec = this.deps.config.notify_cooldown ?? '30m';
  const cooldownMs = parseInterval(intervalSpec);

  if (this.shouldNotifyByCooldown(this.lastRecoveryNotificationAt, cooldownMs)) {
    await this.notify(text);
    // Start a new cooldown window from the moment the message was handed off.
    this.lastRecoveryNotificationAt = Date.now();
    return true;
  }

  return false;
}
}