feat(ops): add setup operator pack, heartbeat alert cooldown, and doctor strict mode
This commit is contained in:
@@ -8,6 +8,7 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
|
||||
return {
|
||||
enabled: true,
|
||||
interval: '5m',
|
||||
notify_cooldown: '30m',
|
||||
checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'],
|
||||
failure_threshold: 2,
|
||||
disk_threshold_mb: 100,
|
||||
@@ -227,6 +228,34 @@ describe('HeartbeatMonitor', () => {
|
||||
expect(mockSend).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('suppresses repeat failure notifications inside notify cooldown after recovery', async () => {
  // Outbound channel stub: every lookup hands back the same spy-backed sender,
  // so the number of notifications can be read off `sendSpy`.
  const sendSpy = vi.fn().mockResolvedValue(undefined);
  const lookupSpy = vi.fn().mockReturnValue({ send: sendSpy });

  // Only the model check runs, a single failure is enough to alert, and the
  // cooldown (1h) is far longer than the test takes to execute.
  const heartbeatConfig = makeConfig({
    checks: ['model'],
    failure_threshold: 1,
    notify_cooldown: '1h',
    notify: { channel: 'telegram', peer: '123' },
  });
  const deps = makeDeps({
    config: heartbeatConfig,
    modelRouter: undefined,
    channelLookup: { get: lookupSpy },
  });
  monitor = new HeartbeatMonitor(deps);

  // Run 1: no model router is configured; exactly one notification is expected.
  await monitor.runChecks();
  expect(sendSpy).toHaveBeenCalledTimes(1);

  // Run 2: a router is supplied on the shared deps object; one more
  // notification is expected (presumably the recovery message).
  Object.assign(deps, { modelRouter: { getTier: () => 'default' } });
  await monitor.runChecks();
  expect(sendSpy).toHaveBeenCalledTimes(2);

  // Run 3: the router is removed again; the send count must stay at 2, i.e.
  // the repeated failure alert is held back while the cooldown is active.
  Object.assign(deps, { modelRouter: undefined });
  await monitor.runChecks();
  expect(sendSpy).toHaveBeenCalledTimes(2);
});
|
||||
|
||||
it('recovery notification sent when checks pass after failures', async () => {
|
||||
const mockSend = vi.fn().mockResolvedValue(undefined);
|
||||
const mockGet = vi.fn().mockReturnValue({ send: mockSend });
|
||||
|
||||
+56
-12
@@ -69,7 +69,11 @@ export class HeartbeatMonitor {
|
||||
private timer: ReturnType<typeof setInterval> | undefined;
|
||||
private lastResult: HeartbeatResult | undefined;
|
||||
private consecutiveFailures = 0;
|
||||
private notifiedFailure = false;
|
||||
private failureAlertSentForCurrentIncident = false;
|
||||
private failureAlertProcessedForCurrentIncident = false;
|
||||
private lastFailureNotificationAt = 0;
|
||||
private lastFailureSignature = '';
|
||||
private lastRecoveryNotificationAt = 0;
|
||||
private readonly deps: HeartbeatDeps;
|
||||
|
||||
constructor(deps: HeartbeatDeps) {
|
||||
@@ -172,28 +176,38 @@ export class HeartbeatMonitor {
|
||||
// Failure tracking and notification
|
||||
if (!healthy) {
|
||||
this.consecutiveFailures++;
|
||||
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.notifiedFailure) {
|
||||
this.notifiedFailure = true;
|
||||
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.failureAlertProcessedForCurrentIncident) {
|
||||
this.failureAlertProcessedForCurrentIncident = true;
|
||||
const failedChecks = checks.filter((c) => !c.healthy).map((c) => `${c.name}: ${c.message}`);
|
||||
await this.notify(`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`);
|
||||
const signature = failedChecks.join('|');
|
||||
const sent = await this.notifyFailureWithCooldown(
|
||||
`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`,
|
||||
signature,
|
||||
);
|
||||
this.failureAlertSentForCurrentIncident = sent;
|
||||
|
||||
auditLogger?.heartbeatFail({
|
||||
checks_failed: failedChecks,
|
||||
consecutive_failures: this.consecutiveFailures,
|
||||
threshold: this.deps.config.failure_threshold,
|
||||
});
|
||||
if (sent) {
|
||||
auditLogger?.heartbeatFail({
|
||||
checks_failed: failedChecks,
|
||||
consecutive_failures: this.consecutiveFailures,
|
||||
threshold: this.deps.config.failure_threshold,
|
||||
});
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (this.notifiedFailure) {
|
||||
if (this.failureAlertSentForCurrentIncident) {
|
||||
// Recovery notification
|
||||
await this.notify(`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`);
|
||||
await this.notifyRecoveryWithCooldown(
|
||||
`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`,
|
||||
);
|
||||
|
||||
auditLogger?.heartbeatRecover({
|
||||
consecutive_failures_before: this.consecutiveFailures,
|
||||
});
|
||||
}
|
||||
this.consecutiveFailures = 0;
|
||||
this.notifiedFailure = false;
|
||||
this.failureAlertSentForCurrentIncident = false;
|
||||
this.failureAlertProcessedForCurrentIncident = false;
|
||||
}
|
||||
|
||||
auditLogger?.heartbeatCycle({
|
||||
@@ -466,4 +480,34 @@ export class HeartbeatMonitor {
|
||||
console.error('HeartbeatMonitor: failed to send notification:', err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reports whether the cooldown window measured from `lastAt` has elapsed.
 *
 * @param lastAt - Timestamp of the previous notification, on the same scale as `Date.now()`.
 * @param cooldownMs - Minimum gap required between two notifications.
 * @returns `true` when the time elapsed since `lastAt` is greater than or equal to `cooldownMs`.
 */
private shouldNotifyByCooldown(lastAt: number, cooldownMs: number): boolean {
  const elapsedMs = Date.now() - lastAt;
  return elapsedMs >= cooldownMs;
}
|
||||
|
||||
/**
 * Sends a failure alert unless it repeats the previous alert inside the
 * configured cooldown window.
 *
 * The alert is held back only when BOTH conditions hold: `signature` equals
 * the signature of the last failure alert, and the cooldown measured from
 * that alert has not yet elapsed. A different signature is sent right away.
 *
 * @param text - Message body passed to `notify`.
 * @param signature - Identifier of the failing-check set, compared against the last one sent.
 * @returns `false` when the alert was suppressed, `true` after `notify` has been awaited.
 */
private async notifyFailureWithCooldown(text: string, signature: string): Promise<boolean> {
  // Falls back to '30m' when the config does not define `notify_cooldown`.
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  const sameAsLastAlert = signature === this.lastFailureSignature;
  const windowElapsed = this.shouldNotifyByCooldown(this.lastFailureNotificationAt, windowMs);

  const suppress = sameAsLastAlert && !windowElapsed;
  if (suppress) {
    return false;
  }

  // NOTE(review): `true` means a send was attempted; whether `notify` surfaces
  // delivery failures is not visible from this method - confirm.
  await this.notify(text);

  // Bookkeeping is updated only after the send attempt has completed.
  this.lastFailureNotificationAt = Date.now();
  this.lastFailureSignature = signature;
  return true;
}
|
||||
|
||||
/**
 * Sends a recovery message unless another recovery message was sent within
 * the configured cooldown window.
 *
 * @param text - Message body passed to `notify`.
 * @returns `false` when the message was suppressed, `true` after `notify` has been awaited.
 */
private async notifyRecoveryWithCooldown(text: string): Promise<boolean> {
  // Falls back to '30m' when the config does not define `notify_cooldown`.
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  const windowElapsed = this.shouldNotifyByCooldown(this.lastRecoveryNotificationAt, windowMs);

  if (windowElapsed) {
    await this.notify(text);
    // The timestamp is recorded only after the send attempt has completed.
    this.lastRecoveryNotificationAt = Date.now();
    return true;
  }

  return false;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user