feat(ops): add setup operator pack, heartbeat alert cooldown, and doctor strict mode
This commit is contained in:
@@ -8,6 +8,7 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
|
||||
return {
|
||||
enabled: true,
|
||||
interval: '5m',
|
||||
notify_cooldown: '30m',
|
||||
checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'],
|
||||
failure_threshold: 2,
|
||||
disk_threshold_mb: 100,
|
||||
@@ -227,6 +228,34 @@ describe('HeartbeatMonitor', () => {
|
||||
expect(mockSend).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('suppresses repeat failure notifications inside notify cooldown after recovery', async () => {
  // Channel stub: every lookup hands back the same sender so calls can be counted.
  const send = vi.fn().mockResolvedValue(undefined);
  const lookup = vi.fn().mockReturnValue({ send });

  // Single check, alert on the first failure, one-hour cooldown between alerts.
  const config = makeConfig({
    checks: ['model'],
    failure_threshold: 1,
    notify_cooldown: '1h',
    notify: { channel: 'telegram', peer: '123' },
  });
  const deps = makeDeps({
    config,
    modelRouter: undefined,
    channelLookup: { get: lookup },
  });
  monitor = new HeartbeatMonitor(deps);

  // Run 1 — no model router: one failure alert is expected.
  await monitor.runChecks();
  expect(send).toHaveBeenCalledTimes(1);

  // Run 2 — router restored on the same deps object: a second message (recovery) is expected.
  Object.assign(deps, { modelRouter: { getTier: () => 'default' } });
  await monitor.runChecks();
  expect(send).toHaveBeenCalledTimes(2);

  // Run 3 — router removed again inside the cooldown: the send count must stay at two.
  Object.assign(deps, { modelRouter: undefined });
  await monitor.runChecks();
  expect(send).toHaveBeenCalledTimes(2);
});
|
||||
|
||||
it('recovery notification sent when checks pass after failures', async () => {
|
||||
const mockSend = vi.fn().mockResolvedValue(undefined);
|
||||
const mockGet = vi.fn().mockReturnValue({ send: mockSend });
|
||||
|
||||
+56
-12
@@ -69,7 +69,11 @@ export class HeartbeatMonitor {
|
||||
private timer: ReturnType<typeof setInterval> | undefined;
|
||||
private lastResult: HeartbeatResult | undefined;
|
||||
private consecutiveFailures = 0;
|
||||
private notifiedFailure = false;
|
||||
private failureAlertSentForCurrentIncident = false;
|
||||
private failureAlertProcessedForCurrentIncident = false;
|
||||
private lastFailureNotificationAt = 0;
|
||||
private lastFailureSignature = '';
|
||||
private lastRecoveryNotificationAt = 0;
|
||||
private readonly deps: HeartbeatDeps;
|
||||
|
||||
constructor(deps: HeartbeatDeps) {
|
||||
@@ -172,28 +176,38 @@ export class HeartbeatMonitor {
|
||||
// Failure tracking and notification
|
||||
if (!healthy) {
|
||||
this.consecutiveFailures++;
|
||||
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.notifiedFailure) {
|
||||
this.notifiedFailure = true;
|
||||
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.failureAlertProcessedForCurrentIncident) {
|
||||
this.failureAlertProcessedForCurrentIncident = true;
|
||||
const failedChecks = checks.filter((c) => !c.healthy).map((c) => `${c.name}: ${c.message}`);
|
||||
await this.notify(`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`);
|
||||
const signature = failedChecks.join('|');
|
||||
const sent = await this.notifyFailureWithCooldown(
|
||||
`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`,
|
||||
signature,
|
||||
);
|
||||
this.failureAlertSentForCurrentIncident = sent;
|
||||
|
||||
auditLogger?.heartbeatFail({
|
||||
checks_failed: failedChecks,
|
||||
consecutive_failures: this.consecutiveFailures,
|
||||
threshold: this.deps.config.failure_threshold,
|
||||
});
|
||||
if (sent) {
|
||||
auditLogger?.heartbeatFail({
|
||||
checks_failed: failedChecks,
|
||||
consecutive_failures: this.consecutiveFailures,
|
||||
threshold: this.deps.config.failure_threshold,
|
||||
});
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (this.notifiedFailure) {
|
||||
if (this.failureAlertSentForCurrentIncident) {
|
||||
// Recovery notification
|
||||
await this.notify(`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`);
|
||||
await this.notifyRecoveryWithCooldown(
|
||||
`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`,
|
||||
);
|
||||
|
||||
auditLogger?.heartbeatRecover({
|
||||
consecutive_failures_before: this.consecutiveFailures,
|
||||
});
|
||||
}
|
||||
this.consecutiveFailures = 0;
|
||||
this.notifiedFailure = false;
|
||||
this.failureAlertSentForCurrentIncident = false;
|
||||
this.failureAlertProcessedForCurrentIncident = false;
|
||||
}
|
||||
|
||||
auditLogger?.heartbeatCycle({
|
||||
@@ -466,4 +480,34 @@ export class HeartbeatMonitor {
|
||||
console.error('HeartbeatMonitor: failed to send notification:', err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reports whether the cooldown window measured from `lastAt` has elapsed.
 *
 * @param lastAt - Timestamp (as produced by `Date.now()`) of the previous notification.
 * @param cooldownMs - Minimum spacing between notifications, in milliseconds.
 * @returns `true` when the time elapsed since `lastAt` is at least `cooldownMs`.
 */
private shouldNotifyByCooldown(lastAt: number, cooldownMs: number): boolean {
  const elapsedMs = Date.now() - lastAt;
  return elapsedMs >= cooldownMs;
}
|
||||
|
||||
/**
 * Sends a failure alert unless the same failure was already reported inside
 * the cooldown window.
 *
 * The alert goes out when the signature differs from the last one sent, or
 * when the configured `notify_cooldown` (falling back to `'30m'` when unset)
 * has elapsed since the last failure alert.
 *
 * @param text - Message body handed to `notify`.
 * @param signature - String identifying the current failure; compared with the last sent one.
 * @returns `true` if `notify` was invoked, `false` if the alert was suppressed.
 */
private async notifyFailureWithCooldown(text: string, signature: string): Promise<boolean> {
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  const sameSignature = signature === this.lastFailureSignature;
  const insideWindow = !this.shouldNotifyByCooldown(this.lastFailureNotificationAt, windowMs);

  // Only a repeat of the same failure inside the window is suppressed.
  if (sameSignature && insideWindow) {
    return false;
  }

  await this.notify(text);

  // Remember what was sent, and when, for the next comparison.
  this.lastFailureSignature = signature;
  this.lastFailureNotificationAt = Date.now();
  return true;
}
|
||||
|
||||
/**
 * Sends a recovery message unless one was already sent inside the cooldown window.
 *
 * The window length comes from the configured `notify_cooldown`, falling back
 * to `'30m'` when unset.
 *
 * @param text - Message body handed to `notify`.
 * @returns `true` if `notify` was invoked, `false` if the message was suppressed.
 */
private async notifyRecoveryWithCooldown(text: string): Promise<boolean> {
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');

  if (this.shouldNotifyByCooldown(this.lastRecoveryNotificationAt, windowMs)) {
    await this.notify(text);
    // Start a new window from the moment this message was sent.
    this.lastRecoveryNotificationAt = Date.now();
    return true;
  }

  return false;
}
|
||||
}
|
||||
|
||||
+17
-1
@@ -1,5 +1,5 @@
|
||||
import { describe, it, expect, afterEach } from 'vitest';
|
||||
import { runChecks, type CheckResult, type DoctorContext } from './doctor.js';
|
||||
import { computeDoctorExitCode, runChecks, type CheckResult, type DoctorContext } from './doctor.js';
|
||||
import { writeFileSync, mkdirSync, rmSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import { tmpdir } from 'os';
|
||||
@@ -11,6 +11,22 @@ describe('doctor checks', () => {
|
||||
try { rmSync(testDir, { recursive: true }); } catch {}
|
||||
});
|
||||
|
||||
it('computeDoctorExitCode returns 0 with warnings in non-strict mode', () => {
  // One pass and one warning, no failures.
  const passAndWarn: CheckResult[] = [
    { status: 'pass', label: 'a' },
    { status: 'warn', label: 'b' },
  ];

  const exitCode = computeDoctorExitCode(passAndWarn, false);

  expect(exitCode).toBe(0);
});
|
||||
|
||||
it('computeDoctorExitCode returns 1 with warnings in strict mode', () => {
  // Same inputs as the non-strict case; only the strict flag differs.
  const passAndWarn: CheckResult[] = [
    { status: 'pass', label: 'a' },
    { status: 'warn', label: 'b' },
  ];

  const exitCode = computeDoctorExitCode(passAndWarn, true);

  expect(exitCode).toBe(1);
});
|
||||
|
||||
it('reports PASS when config file exists and is valid', async () => {
|
||||
mkdirSync(testDir, { recursive: true });
|
||||
const configPath = join(testDir, 'config.yaml');
|
||||
|
||||
+18
-2
@@ -632,12 +632,25 @@ export async function runChecks(ctx: DoctorContext): Promise<CheckResult[]> {
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
 * Maps doctor check results to a process exit code.
 *
 * @param results - Outcome of every check that was run; not modified.
 * @param strict - When `true`, a `warn` result is treated like a failure.
 * @returns `1` if any result has status `fail`, or if `strict` is set and any
 *   result has status `warn`; otherwise `0` (including for an empty list).
 */
export function computeDoctorExitCode(results: readonly CheckResult[], strict: boolean): number {
  // A single short-circuiting pass: stop at the first result that forces a non-zero exit.
  const isBlocking = (r: CheckResult): boolean =>
    r.status === 'fail' || (strict && r.status === 'warn');

  return results.some(isBlocking) ? 1 : 0;
}
|
||||
|
||||
export function registerDoctorCommand(program: Command): void {
|
||||
program
|
||||
.command('doctor')
|
||||
.description('Validate configuration and check system health')
|
||||
.option('-c, --config <path>', 'Config file path')
|
||||
.action(async (opts: { config?: string }) => {
|
||||
.option('--strict', 'Treat warnings as failures')
|
||||
.action(async (opts: { config?: string; strict?: boolean }) => {
|
||||
const configPath = opts.config ?? getConfigPath();
|
||||
const dataDir = getDataDir();
|
||||
|
||||
@@ -662,7 +675,10 @@ export function registerDoctorCommand(program: Command): void {
|
||||
};
|
||||
|
||||
console.log(`Results: ${counts.pass} passed, ${counts.fail} failed, ${counts.warn} warnings, ${counts.skip} skipped`);
|
||||
if (opts.strict && counts.warn > 0) {
|
||||
console.log('Strict mode enabled: warnings are treated as failures.');
|
||||
}
|
||||
|
||||
process.exit(counts.fail > 0 ? 1 : 0);
|
||||
process.exit(computeDoctorExitCode(results, Boolean(opts.strict)));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -57,6 +57,30 @@ const GOOGLE_SERVICES: GoogleService[] = [
|
||||
];
|
||||
|
||||
export async function setupAutomation(p: Prompter, builder: ConfigBuilder): Promise<void> {
|
||||
const enableOperatorPack = await p.confirm(
|
||||
'Enable operator automation pack (scheduled backups + heartbeat alerts + daily briefing + MinIO sync)?',
|
||||
false,
|
||||
);
|
||||
if (enableOperatorPack) {
|
||||
const config = builder.build();
|
||||
const telegramPeer = config.telegram?.allowed_chat_ids?.[0];
|
||||
const defaultOutputChannel = telegramPeer ? 'telegram' : 'webchat';
|
||||
const defaultOutputPeer = telegramPeer ? String(telegramPeer) : 'operator';
|
||||
|
||||
const backupSchedule = await p.ask('Backup cron schedule', '0 2 * * *');
|
||||
const dailyBriefingSchedule = await p.ask('Daily briefing cron schedule', '0 8 * * *');
|
||||
const enableMinioSync = await p.confirm('Include default MinIO sync task?', true);
|
||||
|
||||
builder.applyOperatorPack({
|
||||
outputChannel: defaultOutputChannel,
|
||||
outputPeer: defaultOutputPeer,
|
||||
backupSchedule,
|
||||
dailyBriefingSchedule,
|
||||
enableMinioSync,
|
||||
});
|
||||
p.println(`✓ Operator pack enabled (alerts routed to ${defaultOutputChannel}/${defaultOutputPeer})`);
|
||||
}
|
||||
|
||||
const cron = await p.confirm('Enable cron scheduler?', false);
|
||||
if (cron) {
|
||||
builder.setCronEnabled();
|
||||
|
||||
@@ -84,4 +84,23 @@ describe('ConfigBuilder', () => {
|
||||
const obj = builder.build();
|
||||
expect(obj.server.token).toBe('my-secret-token');
|
||||
});
|
||||
|
||||
it('applies operator automation pack defaults', () => {
  const builder = new ConfigBuilder();
  builder.applyOperatorPack({
    outputChannel: 'telegram',
    outputPeer: '123',
    backupSchedule: '0 2 * * *',
    dailyBriefingSchedule: '0 8 * * *',
    enableMinioSync: true,
  });

  const { backup, automation } = builder.build();

  // Backup section carries the requested schedule and is switched on.
  expect(backup?.enabled).toBe(true);
  expect(backup?.schedule).toBe('0 2 * * *');
  expect(backup?.run_on_start).toBe(true);

  // Each automation section of the pack must be present.
  const sections = automation as Record<string, unknown>;
  expect(sections?.heartbeat).toBeDefined();
  expect(sections?.daily_briefing).toBeDefined();
  expect(sections?.minio_sync).toBeDefined();
});
|
||||
});
|
||||
|
||||
@@ -43,9 +43,23 @@ export interface SetupConfig {
|
||||
gtasks?: { enabled?: boolean };
|
||||
heartbeat?: { enabled?: boolean };
|
||||
} & Record<string, unknown>;
|
||||
backup?: {
|
||||
enabled?: boolean;
|
||||
schedule?: string;
|
||||
run_on_start?: boolean;
|
||||
notify?: { channel: string; peer: string };
|
||||
} & Record<string, unknown>;
|
||||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
/** Inputs accepted by `ConfigBuilder.applyOperatorPack`. */
interface OperatorPackOptions {
  /** Channel written into every notify/output target the pack configures. */
  outputChannel: string;
  /** Peer written into every notify/output target the pack configures. */
  outputPeer: string;
  /** Value stored as the backup `schedule`. */
  backupSchedule: string;
  /** Value stored as the daily briefing `schedule`. */
  dailyBriefingSchedule: string;
  /** Whether to add the MinIO sync section; treated as `true` when omitted. */
  enableMinioSync?: boolean;
}
|
||||
|
||||
export class ConfigBuilder {
|
||||
private config: SetupConfig;
|
||||
|
||||
@@ -187,6 +201,54 @@ export class ConfigBuilder {
|
||||
this.config.automation = automation;
|
||||
}
|
||||
|
||||
/**
 * Enables the operator automation pack on the config being built.
 *
 * Writes the backup section, replaces the `heartbeat` and `daily_briefing`
 * automation sections, and — unless `enableMinioSync` is explicitly `false` —
 * replaces the `minio_sync` section with a single default task. Other keys
 * already present under `automation` or `backup` are kept.
 *
 * @param options - Output target, schedules, and the MinIO sync toggle.
 */
applyOperatorPack(options: OperatorPackOptions): void {
  const { outputChannel, outputPeer, backupSchedule, dailyBriefingSchedule } = options;

  // Build a fresh target object per use so no two sections share a reference.
  const target = (): { channel: string; peer: string } => ({
    channel: outputChannel,
    peer: outputPeer,
  });

  const automation = (this.config.automation ?? {}) as Record<string, unknown>;
  const backup = (this.config.backup ?? {}) as Record<string, unknown>;

  Object.assign(backup, {
    enabled: true,
    schedule: backupSchedule,
    run_on_start: true,
    notify: target(),
  });

  automation.heartbeat = {
    enabled: true,
    notify: target(),
    interval: '5m',
    failure_threshold: 2,
    notify_cooldown: '30m',
  };

  automation.daily_briefing = {
    enabled: true,
    schedule: dailyBriefingSchedule,
    output: target(),
    dedupe_per_local_day: true,
    model_tier: 'fast',
  };

  // An omitted flag counts as opting in.
  const includeMinioSync = options.enableMinioSync ?? true;
  if (includeMinioSync) {
    const defaultTask = {
      prefix: 'knowledge/',
      namespace_base: 'global/knowledge/minio',
      mode: 'append',
      max_objects: 20,
      max_chars_per_object: 8000,
      force: false,
    };
    automation.minio_sync = {
      enabled: true,
      interval: '6h',
      run_on_start: true,
      notify: target(),
      tasks: [defaultTask],
    };
  }

  this.config.automation = automation;
  this.config.backup = backup;
}
|
||||
|
||||
/**
 * Produces the accumulated configuration.
 *
 * @returns A `structuredClone` copy of the internal config, so changes made to
 *   the returned object do not affect the builder.
 */
build(): SetupConfig {
  const snapshot = structuredClone(this.config) as SetupConfig;
  return snapshot;
}
|
||||
|
||||
@@ -1002,6 +1002,7 @@ describe('configSchema automation', () => {
|
||||
|
||||
it('defaults heartbeat extended thresholds and checks', () => {
|
||||
const result = configSchema.parse(baseConfig);
|
||||
expect(result.automation.heartbeat.notify_cooldown).toBe('30m');
|
||||
expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500);
|
||||
expect(result.automation.heartbeat.backup_failure_threshold).toBe(1);
|
||||
expect(result.automation.heartbeat.provider_error_rate_threshold).toBe(0.5);
|
||||
|
||||
@@ -308,6 +308,7 @@ const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', '
|
||||
const heartbeatSchema = z.object({
|
||||
enabled: z.boolean().default(false),
|
||||
interval: z.string().default('5m'),
|
||||
notify_cooldown: z.string().default('30m'),
|
||||
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']),
|
||||
notify: z.object({
|
||||
channel: z.string().min(1),
|
||||
|
||||
Reference in New Issue
Block a user