feat(ops): add setup operator pack, heartbeat alert cooldown, and doctor strict mode

This commit is contained in:
William Valentin
2026-02-16 14:57:56 -08:00
parent 030fb13a26
commit 3210e75c94
12 changed files with 274 additions and 17 deletions
+24 -1
View File
@@ -89,6 +89,9 @@ flynn send "What's the weather in London?"
# Check system health
flynn doctor --config ~/.config/flynn/config.yaml
# Treat warnings as failures (useful in CI)
flynn doctor --strict
# Show current config (secrets masked)
flynn config
@@ -705,6 +708,7 @@ automation:
heartbeat:
enabled: true
interval: "5m" # Check every 5 minutes
notify_cooldown: "30m" # Suppress repeated alerts inside cooldown window
checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors]
notify:
channel: telegram
@@ -731,6 +735,7 @@ automation:
| `provider_errors` | Model provider error rates stay below threshold |
The monitor sends a notification when failures reach the configured threshold and a recovery notification when all checks pass again.
Repeated failure/recovery notifications are throttled by `notify_cooldown`.
### Heartbeat Config Fields
@@ -738,7 +743,8 @@ The monitor sends a notification when failures reach the configured threshold an
|-------|----------|-------------|
| `enabled` | no | Enable the heartbeat monitor (default: `false`) |
| `interval` | no | Check interval: `60s`, `5m`, `1h` (default: `5m`) |
| `checks` | no | Which checks to run (default: all five) |
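| `notify_cooldown` | no | Minimum time between repeated heartbeat notifications of the same type (default: `30m`). A failure alert whose failing checks or messages differ from the previously sent alert is not throttled |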
| `checks` | no | Which checks to run (default: `gateway, model, channels, memory, disk, process_memory, backup, provider_errors`) |
| `notify.channel` | no | Channel to send failure/recovery notifications |
| `notify.peer` | no | Peer/chat ID for notifications |
| `failure_threshold` | no | Consecutive failures before notifying (default: `2`) |
@@ -748,6 +754,23 @@ The monitor sends a notification when failures reach the configured threshold an
| `provider_error_rate_threshold` | no | Error-rate threshold (0..1) for `provider_errors` check (default: `0.5`) |
| `provider_error_min_calls` | no | Minimum provider calls before applying error-rate threshold (default: `5`) |
### Common Schedules and Routing
- Nightly backups with Telegram alerts:
- `backup.schedule: "0 2 * * *"`
- `backup.notify.channel: telegram`
- Weekday daily briefing to Discord:
- `automation.daily_briefing.schedule: "0 8 * * 1-5"`
- `automation.daily_briefing.output.channel: discord`
- High-frequency heartbeat to Slack:
- `automation.heartbeat.interval: "2m"`
- `automation.heartbeat.notify.channel: slack`
- MinIO sync every 6h to WebChat:
- `automation.minio_sync.interval: "6h"`
- `automation.minio_sync.notify.channel: webchat`
`flynn setup` now includes an Operator Pack option in Automation that preconfigures scheduled backups, heartbeat alerts, a daily briefing, and a default MinIO sync task.
## Gmail Pub/Sub Watcher
Monitor a Gmail inbox and forward new messages into the agent pipeline.
+1
View File
@@ -317,6 +317,7 @@ hooks:
# heartbeat:
# enabled: false
# interval: "5m"
# notify_cooldown: "30m"
# checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors]
# notify:
# channel: telegram
+22 -1
View File
@@ -189,6 +189,27 @@
],
"test_status": "pnpm test:run src/cli/minioExtractors.test.ts src/cli/doctor.test.ts + pnpm typecheck passing"
},
"operator-pack-heartbeat-throttle-and-doctor-strict": {
"status": "completed",
"date": "2026-02-16",
"updated": "2026-02-16",
"summary": "Implemented operator-focused hardening and onboarding polish: added a setup Automation operator-pack path that preconfigures scheduled backups, heartbeat alerts, daily briefing, and default MinIO sync; added heartbeat notification throttling via `automation.heartbeat.notify_cooldown`; and added `flynn doctor --strict` to treat warnings as failures. Updated docs/default config examples accordingly.",
"files_modified": [
"src/cli/setup/config.ts",
"src/cli/setup/config.test.ts",
"src/cli/setup/automation.ts",
"src/automation/heartbeat.ts",
"src/automation/heartbeat.test.ts",
"src/config/schema.ts",
"src/config/schema.test.ts",
"src/cli/doctor.ts",
"src/cli/doctor.test.ts",
"config/default.yaml",
"README.md",
"docs/plans/state.json"
],
"test_status": "pnpm test:run src/cli/setup/config.test.ts src/automation/heartbeat.test.ts src/config/schema.test.ts src/cli/doctor.test.ts + pnpm typecheck passing"
},
"backup-session-summary-audit-trail": {
"status": "completed",
"date": "2026-02-16",
@@ -3473,7 +3494,7 @@
}
},
"overall_progress": {
"total_test_count": 1859,
"total_test_count": 1863,
"all_tests_passing": true,
"p0_completion": "3/3 (100%)",
"p1_completion": "4/4 (100%)",
+29
View File
@@ -8,6 +8,7 @@ function makeConfig(overrides?: Partial<HeartbeatConfig>): HeartbeatConfig {
return {
enabled: true,
interval: '5m',
notify_cooldown: '30m',
checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'],
failure_threshold: 2,
disk_threshold_mb: 100,
@@ -227,6 +228,34 @@ describe('HeartbeatMonitor', () => {
expect(mockSend).toHaveBeenCalledTimes(1);
});
// Scenario: fail -> recover -> fail again with the same failing check, all inside a 1h
// notify cooldown. The second failure alert must be suppressed.
it('suppresses repeat failure notifications inside notify cooldown after recovery', async () => {
const mockSend = vi.fn().mockResolvedValue(undefined);
const mockGet = vi.fn().mockReturnValue({ send: mockSend });
const deps = makeDeps({
config: makeConfig({
checks: ['model'],
failure_threshold: 1,
notify_cooldown: '1h',
notify: { channel: 'telegram', peer: '123' },
}),
// No model router for the first run; presumably this is what makes the 'model' check fail.
modelRouter: undefined,
channelLookup: { get: mockGet },
});
monitor = new HeartbeatMonitor(deps);
// Run 1: failure_threshold is 1, so the first failing run is expected to alert immediately.
await monitor.runChecks();
expect(mockSend).toHaveBeenCalledTimes(1);
// Run 2: swap a router into the same deps object (assumes the monitor keeps a reference
// to it rather than a copy) so the run passes; the second send is the recovery notice.
Object.assign(deps, { modelRouter: { getTier: () => 'default' } });
await monitor.runChecks();
expect(mockSend).toHaveBeenCalledTimes(2);
// Run 3: same failure again, still inside the 1h cooldown. The send count stays at 2,
// i.e. the repeat failure alert is suppressed.
Object.assign(deps, { modelRouter: undefined });
await monitor.runChecks();
expect(mockSend).toHaveBeenCalledTimes(2);
});
it('recovery notification sent when checks pass after failures', async () => {
const mockSend = vi.fn().mockResolvedValue(undefined);
const mockGet = vi.fn().mockReturnValue({ send: mockSend });
+56 -12
View File
@@ -69,7 +69,11 @@ export class HeartbeatMonitor {
private timer: ReturnType<typeof setInterval> | undefined;
private lastResult: HeartbeatResult | undefined;
private consecutiveFailures = 0;
private notifiedFailure = false;
private failureAlertSentForCurrentIncident = false;
private failureAlertProcessedForCurrentIncident = false;
private lastFailureNotificationAt = 0;
private lastFailureSignature = '';
private lastRecoveryNotificationAt = 0;
private readonly deps: HeartbeatDeps;
constructor(deps: HeartbeatDeps) {
@@ -172,28 +176,38 @@ export class HeartbeatMonitor {
// Failure tracking and notification
if (!healthy) {
this.consecutiveFailures++;
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.notifiedFailure) {
this.notifiedFailure = true;
if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.failureAlertProcessedForCurrentIncident) {
this.failureAlertProcessedForCurrentIncident = true;
const failedChecks = checks.filter((c) => !c.healthy).map((c) => `${c.name}: ${c.message}`);
await this.notify(`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`);
const signature = failedChecks.join('|');
const sent = await this.notifyFailureWithCooldown(
`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`,
signature,
);
this.failureAlertSentForCurrentIncident = sent;
auditLogger?.heartbeatFail({
checks_failed: failedChecks,
consecutive_failures: this.consecutiveFailures,
threshold: this.deps.config.failure_threshold,
});
if (sent) {
auditLogger?.heartbeatFail({
checks_failed: failedChecks,
consecutive_failures: this.consecutiveFailures,
threshold: this.deps.config.failure_threshold,
});
}
}
} else {
if (this.notifiedFailure) {
if (this.failureAlertSentForCurrentIncident) {
// Recovery notification
await this.notify(`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`);
await this.notifyRecoveryWithCooldown(
`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`,
);
auditLogger?.heartbeatRecover({
consecutive_failures_before: this.consecutiveFailures,
});
}
this.consecutiveFailures = 0;
this.notifiedFailure = false;
this.failureAlertSentForCurrentIncident = false;
this.failureAlertProcessedForCurrentIncident = false;
}
auditLogger?.heartbeatCycle({
@@ -466,4 +480,34 @@ export class HeartbeatMonitor {
console.error('HeartbeatMonitor: failed to send notification:', err);
}
}
/**
 * Reports whether the cooldown window has elapsed since a previous notification.
 *
 * @param lastAt - Epoch-millisecond timestamp of the previous notification
 *   (the tracking fields on this class start at 0).
 * @param cooldownMs - Length of the cooldown window in milliseconds.
 * @returns `true` when at least `cooldownMs` has passed since `lastAt`.
 */
private shouldNotifyByCooldown(lastAt: number, cooldownMs: number): boolean {
  const elapsedMs = Date.now() - lastAt;
  return elapsedMs >= cooldownMs;
}
/**
 * Sends a failure alert unless it repeats the previously sent alert inside the
 * cooldown window.
 *
 * The alert goes out when its signature differs from the last sent one, or when
 * the cooldown (`notify_cooldown`, falling back to `'30m'`) has elapsed since the
 * last failure alert. On send, the timestamp and signature are recorded.
 *
 * @param text - Message body passed to `notify`.
 * @param signature - Identity of the current set of failing checks.
 * @returns `true` when the alert was handed to `notify`, `false` when suppressed.
 */
private async notifyFailureWithCooldown(text: string, signature: string): Promise<boolean> {
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  const isRepeat = signature === this.lastFailureSignature;
  const windowElapsed = this.shouldNotifyByCooldown(this.lastFailureNotificationAt, windowMs);

  // Only an identical alert that is still inside the window gets dropped.
  const suppressed = isRepeat && !windowElapsed;
  if (suppressed) {
    return false;
  }

  await this.notify(text);
  this.lastFailureNotificationAt = Date.now();
  this.lastFailureSignature = signature;
  return true;
}
/**
 * Sends a recovery notice unless one was already sent inside the cooldown window
 * (`notify_cooldown`, falling back to `'30m'`).
 *
 * @param text - Message body passed to `notify`.
 * @returns `true` when the notice was handed to `notify`, `false` when suppressed.
 */
private async notifyRecoveryWithCooldown(text: string): Promise<boolean> {
  const windowMs = parseInterval(this.deps.config.notify_cooldown ?? '30m');
  if (!this.shouldNotifyByCooldown(this.lastRecoveryNotificationAt, windowMs)) {
    return false;
  }

  await this.notify(text);
  // Stamp after the send so the next window starts from this notice.
  this.lastRecoveryNotificationAt = Date.now();
  return true;
}
}
+17 -1
View File
@@ -1,5 +1,5 @@
import { describe, it, expect, afterEach } from 'vitest';
import { runChecks, type CheckResult, type DoctorContext } from './doctor.js';
import { computeDoctorExitCode, runChecks, type CheckResult, type DoctorContext } from './doctor.js';
import { writeFileSync, mkdirSync, rmSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
@@ -11,6 +11,22 @@ describe('doctor checks', () => {
try { rmSync(testDir, { recursive: true }); } catch {}
});
it('computeDoctorExitCode returns 0 with warnings in non-strict mode', () => {
  // A warning without any failure must not fail the run when strict is off.
  const passThenWarn: CheckResult[] = [
    { status: 'pass', label: 'a' },
    { status: 'warn', label: 'b' },
  ];
  const exitCode = computeDoctorExitCode(passThenWarn, false);
  expect(exitCode).toBe(0);
});
it('computeDoctorExitCode returns 1 with warnings in strict mode', () => {
  // The same pass+warn input becomes a failing exit code once strict is on.
  const passThenWarn: CheckResult[] = [
    { status: 'pass', label: 'a' },
    { status: 'warn', label: 'b' },
  ];
  const exitCode = computeDoctorExitCode(passThenWarn, true);
  expect(exitCode).toBe(1);
});
it('reports PASS when config file exists and is valid', async () => {
mkdirSync(testDir, { recursive: true });
const configPath = join(testDir, 'config.yaml');
+18 -2
View File
@@ -632,12 +632,25 @@ export async function runChecks(ctx: DoctorContext): Promise<CheckResult[]> {
return results;
}
/**
 * Maps doctor check results to a process exit code.
 *
 * @param results - Outcome of every check that was run.
 * @param strict - When `true`, a `'warn'` result fails the run just like `'fail'`.
 * @returns `1` if any check failed (or warned, in strict mode); otherwise `0`.
 */
export function computeDoctorExitCode(results: CheckResult[], strict: boolean): number {
  const anyFailed = results.some((check) => check.status === 'fail');
  const anyWarned = results.some((check) => check.status === 'warn');
  const shouldFail = anyFailed || (strict && anyWarned);
  return shouldFail ? 1 : 0;
}
export function registerDoctorCommand(program: Command): void {
program
.command('doctor')
.description('Validate configuration and check system health')
.option('-c, --config <path>', 'Config file path')
.action(async (opts: { config?: string }) => {
.option('--strict', 'Treat warnings as failures')
.action(async (opts: { config?: string; strict?: boolean }) => {
const configPath = opts.config ?? getConfigPath();
const dataDir = getDataDir();
@@ -662,7 +675,10 @@ export function registerDoctorCommand(program: Command): void {
};
console.log(`Results: ${counts.pass} passed, ${counts.fail} failed, ${counts.warn} warnings, ${counts.skip} skipped`);
if (opts.strict && counts.warn > 0) {
console.log('Strict mode enabled: warnings are treated as failures.');
}
process.exit(counts.fail > 0 ? 1 : 0);
process.exit(computeDoctorExitCode(results, Boolean(opts.strict)));
});
}
+24
View File
@@ -57,6 +57,30 @@ const GOOGLE_SERVICES: GoogleService[] = [
];
export async function setupAutomation(p: Prompter, builder: ConfigBuilder): Promise<void> {
const enableOperatorPack = await p.confirm(
'Enable operator automation pack (scheduled backups + heartbeat alerts + daily briefing + MinIO sync)?',
false,
);
if (enableOperatorPack) {
const config = builder.build();
const telegramPeer = config.telegram?.allowed_chat_ids?.[0];
const defaultOutputChannel = telegramPeer ? 'telegram' : 'webchat';
const defaultOutputPeer = telegramPeer ? String(telegramPeer) : 'operator';
const backupSchedule = await p.ask('Backup cron schedule', '0 2 * * *');
const dailyBriefingSchedule = await p.ask('Daily briefing cron schedule', '0 8 * * *');
const enableMinioSync = await p.confirm('Include default MinIO sync task?', true);
builder.applyOperatorPack({
outputChannel: defaultOutputChannel,
outputPeer: defaultOutputPeer,
backupSchedule,
dailyBriefingSchedule,
enableMinioSync,
});
p.println(`✓ Operator pack enabled (alerts routed to ${defaultOutputChannel}/${defaultOutputPeer})`);
}
const cron = await p.confirm('Enable cron scheduler?', false);
if (cron) {
builder.setCronEnabled();
+19
View File
@@ -84,4 +84,23 @@ describe('ConfigBuilder', () => {
const obj = builder.build();
expect(obj.server.token).toBe('my-secret-token');
});
it('applies operator automation pack defaults', () => {
  const builder = new ConfigBuilder();
  builder.applyOperatorPack({
    outputChannel: 'telegram',
    outputPeer: '123',
    backupSchedule: '0 2 * * *',
    dailyBriefingSchedule: '0 8 * * *',
    enableMinioSync: true,
  });

  const built = builder.build();

  // Backup section is switched on with the requested schedule.
  expect(built.backup?.enabled).toBe(true);
  expect(built.backup?.schedule).toBe('0 2 * * *');
  expect(built.backup?.run_on_start).toBe(true);

  // Each automation section of the pack must be present.
  const automation = built.automation as Record<string, unknown>;
  for (const section of ['heartbeat', 'daily_briefing', 'minio_sync']) {
    expect(automation?.[section]).toBeDefined();
  }
});
});
+62
View File
@@ -43,9 +43,23 @@ export interface SetupConfig {
gtasks?: { enabled?: boolean };
heartbeat?: { enabled?: boolean };
} & Record<string, unknown>;
backup?: {
enabled?: boolean;
schedule?: string;
run_on_start?: boolean;
notify?: { channel: string; peer: string };
} & Record<string, unknown>;
[key: string]: unknown;
}
/** Inputs for `ConfigBuilder.applyOperatorPack`. */
interface OperatorPackOptions {
/** Channel written into the backup, heartbeat, daily-briefing and MinIO-sync routing blocks. */
outputChannel: string;
/** Peer/chat ID written alongside `outputChannel` in those same routing blocks. */
outputPeer: string;
/** Value stored as `backup.schedule`. */
backupSchedule: string;
/** Value stored as `automation.daily_briefing.schedule`. */
dailyBriefingSchedule: string;
/** Whether to add the default `automation.minio_sync` task; treated as `true` when omitted. */
enableMinioSync?: boolean;
}
export class ConfigBuilder {
private config: SetupConfig;
@@ -187,6 +201,54 @@ export class ConfigBuilder {
this.config.automation = automation;
}
/**
 * Enables the operator automation pack on the config being built.
 *
 * Turns on scheduled backups, the heartbeat monitor and the daily briefing, and
 * (unless `enableMinioSync` is `false`) a default MinIO sync task. All of them
 * route their notifications/output to `outputChannel`/`outputPeer`.
 * Existing `backup` keys not set here are kept; the `heartbeat`,
 * `daily_briefing` and `minio_sync` entries under `automation` are replaced.
 *
 * @param options - Routing target, schedules and the MinIO sync toggle.
 */
applyOperatorPack(options: OperatorPackOptions): void {
  const { outputChannel, outputPeer } = options;
  // Fresh object on every call so no two config sections share a reference.
  const route = (): { channel: string; peer: string } => ({ channel: outputChannel, peer: outputPeer });

  const automationSection = (this.config.automation ?? {}) as Record<string, unknown>;
  const backupSection = (this.config.backup ?? {}) as Record<string, unknown>;

  Object.assign(backupSection, {
    enabled: true,
    schedule: options.backupSchedule,
    run_on_start: true,
    notify: route(),
  });

  automationSection.heartbeat = {
    enabled: true,
    notify: route(),
    interval: '5m',
    failure_threshold: 2,
    notify_cooldown: '30m',
  };

  automationSection.daily_briefing = {
    enabled: true,
    schedule: options.dailyBriefingSchedule,
    output: route(),
    dedupe_per_local_day: true,
    model_tier: 'fast',
  };

  const includeMinioSync = options.enableMinioSync ?? true;
  if (includeMinioSync) {
    const defaultTask = {
      prefix: 'knowledge/',
      namespace_base: 'global/knowledge/minio',
      mode: 'append',
      max_objects: 20,
      max_chars_per_object: 8000,
      force: false,
    };
    automationSection.minio_sync = {
      enabled: true,
      interval: '6h',
      run_on_start: true,
      notify: route(),
      tasks: [defaultTask],
    };
  }

  this.config.automation = automationSection;
  this.config.backup = backupSection;
}
build(): SetupConfig {
return structuredClone(this.config) as SetupConfig;
}
+1
View File
@@ -1002,6 +1002,7 @@ describe('configSchema automation', () => {
it('defaults heartbeat extended thresholds and checks', () => {
const result = configSchema.parse(baseConfig);
expect(result.automation.heartbeat.notify_cooldown).toBe('30m');
expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500);
expect(result.automation.heartbeat.backup_failure_threshold).toBe(1);
expect(result.automation.heartbeat.provider_error_rate_threshold).toBe(0.5);
+1
View File
@@ -308,6 +308,7 @@ const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', '
const heartbeatSchema = z.object({
enabled: z.boolean().default(false),
interval: z.string().default('5m'),
notify_cooldown: z.string().default('30m'),
checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']),
notify: z.object({
channel: z.string().min(1),