diff --git a/README.md b/README.md index d34d817..19e2fed 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,9 @@ flynn send "What's the weather in London?" # Check system health flynn doctor --config ~/.config/flynn/config.yaml +# Treat warnings as failures (useful in CI) +flynn doctor --strict + # Show current config (secrets masked) flynn config @@ -705,6 +708,7 @@ automation: heartbeat: enabled: true interval: "5m" # Check every 5 minutes + notify_cooldown: "30m" # Suppress repeated alerts inside cooldown window checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors] notify: channel: telegram @@ -731,6 +735,7 @@ automation: | `provider_errors` | Model provider error rates stay below threshold | The monitor sends a notification when failures reach the configured threshold and a recovery notification when all checks pass again. +Repeated failure/recovery notifications are throttled by `notify_cooldown`. ### Heartbeat Config Fields @@ -738,7 +743,8 @@ The monitor sends a notification when failures reach the configured threshold an |-------|----------|-------------| | `enabled` | no | Enable the heartbeat monitor (default: `false`) | | `interval` | no | Check interval: `60s`, `5m`, `1h` (default: `5m`) | -| `checks` | no | Which checks to run (default: all five) | +| `notify_cooldown` | no | Minimum time between repeated heartbeat notifications of the same type (default: `30m`) | +| `checks` | no | Which checks to run (default: `gateway, model, channels, memory, disk, process_memory, backup, provider_errors`) | | `notify.channel` | no | Channel to send failure/recovery notifications | | `notify.peer` | no | Peer/chat ID for notifications | | `failure_threshold` | no | Consecutive failures before notifying (default: `2`) | @@ -748,6 +754,23 @@ The monitor sends a notification when failures reach the configured threshold an | `provider_error_rate_threshold` | no | Error-rate threshold (0..1) for `provider_errors` check (default: `0.5`) | | `provider_error_min_calls` | no | Minimum provider calls before applying error-rate threshold (default: `5`) | +### Common Schedules and Routing + +- Nightly backups to Telegram alerts: + - `backup.schedule: "0 2 * * *"` + - `backup.notify.channel: telegram` +- Weekday daily briefing to Discord: + - `automation.daily_briefing.schedule: "0 8 * * 1-5"` + - `automation.daily_briefing.output.channel: discord` +- High-frequency heartbeat to Slack: + - `automation.heartbeat.interval: "2m"` + - `automation.heartbeat.notify.channel: slack` +- MinIO sync every 6h to WebChat: + - `automation.minio_sync.interval: "6h"` + - `automation.minio_sync.notify.channel: webchat` + +`flynn setup` now includes an Operator Pack option in Automation that preconfigures scheduled backups, heartbeat alerts, a daily briefing, and a default MinIO sync task. + ## Gmail Pub/Sub Watcher Monitor a Gmail inbox and forward new messages into the agent pipeline. diff --git a/config/default.yaml b/config/default.yaml index 706a47d..856397b 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -317,6 +317,7 @@ hooks: # heartbeat: # enabled: false # interval: "5m" +# notify_cooldown: "30m" # checks: [gateway, model, channels, memory, disk, process_memory, backup, provider_errors] # notify: # channel: telegram diff --git a/docs/plans/state.json b/docs/plans/state.json index d28c559..2422f92 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -189,6 +189,27 @@ ], "test_status": "pnpm test:run src/cli/minioExtractors.test.ts src/cli/doctor.test.ts + pnpm typecheck passing" }, + "operator-pack-heartbeat-throttle-and-doctor-strict": { + "status": "completed", + "date": "2026-02-16", + "updated": "2026-02-16", + "summary": "Implemented operator-focused hardening and onboarding polish: added a setup Automation operator-pack path that preconfigures scheduled backups, heartbeat alerts, daily briefing, and default MinIO sync; added heartbeat notification throttling via `automation.heartbeat.notify_cooldown`; and added `flynn doctor --strict` to treat warnings as failures. Updated docs/default config examples accordingly.", + "files_modified": [ + "src/cli/setup/config.ts", + "src/cli/setup/config.test.ts", + "src/cli/setup/automation.ts", + "src/automation/heartbeat.ts", + "src/automation/heartbeat.test.ts", + "src/config/schema.ts", + "src/config/schema.test.ts", + "src/cli/doctor.ts", + "src/cli/doctor.test.ts", + "config/default.yaml", + "README.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/cli/setup/config.test.ts src/automation/heartbeat.test.ts src/config/schema.test.ts src/cli/doctor.test.ts + pnpm typecheck passing" + }, "backup-session-summary-audit-trail": { "status": "completed", "date": "2026-02-16", @@ -3473,7 +3494,7 @@ } }, "overall_progress": { - "total_test_count": 1859, + "total_test_count": 1863, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", diff --git a/src/automation/heartbeat.test.ts b/src/automation/heartbeat.test.ts index ca0591e..44cf15d 100644 --- a/src/automation/heartbeat.test.ts +++ b/src/automation/heartbeat.test.ts @@ -8,6 +8,7 @@ function makeConfig(overrides?: Partial): HeartbeatConfig { return { enabled: true, interval: '5m', + notify_cooldown: '30m', checks: ['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup'], failure_threshold: 2, disk_threshold_mb: 100, @@ -227,6 +228,34 @@ describe('HeartbeatMonitor', () => { expect(mockSend).toHaveBeenCalledTimes(1); }); + it('suppresses repeat failure notifications inside notify cooldown after recovery', async () => { + const mockSend = vi.fn().mockResolvedValue(undefined); + const mockGet = vi.fn().mockReturnValue({ send: mockSend }); + + const deps = makeDeps({ + config: makeConfig({ + checks: ['model'], + failure_threshold: 1, + notify_cooldown: '1h', + notify: { channel: 'telegram', peer: '123' }, + }), + modelRouter: undefined, + channelLookup: { get: mockGet }, + }); + monitor = new HeartbeatMonitor(deps); + + await monitor.runChecks(); + expect(mockSend).toHaveBeenCalledTimes(1); + + Object.assign(deps, { modelRouter: { getTier: () => 'default' } }); + await monitor.runChecks(); + expect(mockSend).toHaveBeenCalledTimes(2); + + Object.assign(deps, { modelRouter: undefined }); + await monitor.runChecks(); + expect(mockSend).toHaveBeenCalledTimes(2); + }); + it('recovery notification sent when checks pass after failures', async () => { const mockSend = vi.fn().mockResolvedValue(undefined); const mockGet = vi.fn().mockReturnValue({ send: mockSend }); diff --git a/src/automation/heartbeat.ts b/src/automation/heartbeat.ts index 6561cce..5afb3cf 100644 --- a/src/automation/heartbeat.ts +++ b/src/automation/heartbeat.ts @@ -69,7 +69,11 @@ export class HeartbeatMonitor { private timer: ReturnType | undefined; private lastResult: HeartbeatResult | undefined; private consecutiveFailures = 0; - private notifiedFailure = false; + private failureAlertSentForCurrentIncident = false; + private failureAlertProcessedForCurrentIncident = false; + private lastFailureNotificationAt = 0; + private lastFailureSignature = ''; + private lastRecoveryNotificationAt = 0; private readonly deps: HeartbeatDeps; constructor(deps: HeartbeatDeps) { @@ -172,28 +176,38 @@ export class HeartbeatMonitor { // Failure tracking and notification if (!healthy) { this.consecutiveFailures++; - if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.notifiedFailure) { - this.notifiedFailure = true; + if (this.consecutiveFailures >= this.deps.config.failure_threshold && !this.failureAlertProcessedForCurrentIncident) { + this.failureAlertProcessedForCurrentIncident = true; const failedChecks = checks.filter((c) => !c.healthy).map((c) => `${c.name}: ${c.message}`); - await this.notify(`Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`); + const signature = failedChecks.join('|'); + const sent = await this.notifyFailureWithCooldown( + `Heartbeat FAILING (${this.consecutiveFailures} consecutive failures):\n${failedChecks.join('\n')}`, + signature, + ); + this.failureAlertSentForCurrentIncident = sent; - auditLogger?.heartbeatFail({ - checks_failed: failedChecks, - consecutive_failures: this.consecutiveFailures, - threshold: this.deps.config.failure_threshold, - }); + if (sent) { + auditLogger?.heartbeatFail({ + checks_failed: failedChecks, + consecutive_failures: this.consecutiveFailures, + threshold: this.deps.config.failure_threshold, + }); + } } } else { - if (this.notifiedFailure) { + if (this.failureAlertSentForCurrentIncident) { // Recovery notification - await this.notify(`Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`); + await this.notifyRecoveryWithCooldown( + `Heartbeat RECOVERED after ${this.consecutiveFailures} consecutive failure(s). All checks passing.`, + ); auditLogger?.heartbeatRecover({ consecutive_failures_before: this.consecutiveFailures, }); } this.consecutiveFailures = 0; - this.notifiedFailure = false; + this.failureAlertSentForCurrentIncident = false; + this.failureAlertProcessedForCurrentIncident = false; } auditLogger?.heartbeatCycle({ @@ -466,4 +480,34 @@ export class HeartbeatMonitor { console.error('HeartbeatMonitor: failed to send notification:', err); } } + + private shouldNotifyByCooldown(lastAt: number, cooldownMs: number): boolean { + return Date.now() - lastAt >= cooldownMs; + } + + private async notifyFailureWithCooldown(text: string, signature: string): Promise { + const cooldownMs = parseInterval(this.deps.config.notify_cooldown ?? '30m'); + const signatureChanged = signature !== this.lastFailureSignature; + const cooldownPassed = this.shouldNotifyByCooldown(this.lastFailureNotificationAt, cooldownMs); + if (!signatureChanged && !cooldownPassed) { + return false; + } + + await this.notify(text); + this.lastFailureNotificationAt = Date.now(); + this.lastFailureSignature = signature; + return true; + } + + private async notifyRecoveryWithCooldown(text: string): Promise { + const cooldownMs = parseInterval(this.deps.config.notify_cooldown ?? '30m'); + const cooldownPassed = this.shouldNotifyByCooldown(this.lastRecoveryNotificationAt, cooldownMs); + if (!cooldownPassed) { + return false; + } + + await this.notify(text); + this.lastRecoveryNotificationAt = Date.now(); + return true; + } } diff --git a/src/cli/doctor.test.ts b/src/cli/doctor.test.ts index 4cab918..75167b8 100644 --- a/src/cli/doctor.test.ts +++ b/src/cli/doctor.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect, afterEach } from 'vitest'; -import { runChecks, type CheckResult, type DoctorContext } from './doctor.js'; +import { computeDoctorExitCode, runChecks, type CheckResult, type DoctorContext } from './doctor.js'; import { writeFileSync, mkdirSync, rmSync } from 'fs'; import { join } from 'path'; import { tmpdir } from 'os'; @@ -11,6 +11,22 @@ describe('doctor checks', () => { try { rmSync(testDir, { recursive: true }); } catch {} }); + it('computeDoctorExitCode returns 0 with warnings in non-strict mode', () => { + const results: CheckResult[] = [ + { status: 'pass', label: 'a' }, + { status: 'warn', label: 'b' }, + ]; + expect(computeDoctorExitCode(results, false)).toBe(0); + }); + + it('computeDoctorExitCode returns 1 with warnings in strict mode', () => { + const results: CheckResult[] = [ + { status: 'pass', label: 'a' }, + { status: 'warn', label: 'b' }, + ]; + expect(computeDoctorExitCode(results, true)).toBe(1); + }); + it('reports PASS when config file exists and is valid', async () => { mkdirSync(testDir, { recursive: true }); const configPath = join(testDir, 'config.yaml'); diff --git a/src/cli/doctor.ts b/src/cli/doctor.ts index 94c15f4..c09901b 100644 --- a/src/cli/doctor.ts +++ b/src/cli/doctor.ts @@ -632,12 +632,25 @@ export async function runChecks(ctx: DoctorContext): Promise { return results; } +export function computeDoctorExitCode(results: CheckResult[], strict: boolean): number { + const failCount = results.filter((r) => r.status === 'fail').length; + const warnCount = results.filter((r) => r.status === 'warn').length; + if (failCount > 0) { + return 1; + } + if (strict && warnCount > 0) { + return 1; + } + return 0; +} + export function registerDoctorCommand(program: Command): void { program .command('doctor') .description('Validate configuration and check system health') .option('-c, --config ', 'Config file path') - .action(async (opts: { config?: string }) => { + .option('--strict', 'Treat warnings as failures') + .action(async (opts: { config?: string; strict?: boolean }) => { const configPath = opts.config ?? getConfigPath(); const dataDir = getDataDir(); @@ -662,7 +675,10 @@ export function registerDoctorCommand(program: Command): void { }; console.log(`Results: ${counts.pass} passed, ${counts.fail} failed, ${counts.warn} warnings, ${counts.skip} skipped`); + if (opts.strict && counts.warn > 0) { + console.log('Strict mode enabled: warnings are treated as failures.'); + } - process.exit(counts.fail > 0 ? 1 : 0); + process.exit(computeDoctorExitCode(results, Boolean(opts.strict))); }); } diff --git a/src/cli/setup/automation.ts b/src/cli/setup/automation.ts index 0621ec2..b20838f 100644 --- a/src/cli/setup/automation.ts +++ b/src/cli/setup/automation.ts @@ -57,6 +57,30 @@ const GOOGLE_SERVICES: GoogleService[] = [ ]; export async function setupAutomation(p: Prompter, builder: ConfigBuilder): Promise { + const enableOperatorPack = await p.confirm( + 'Enable operator automation pack (scheduled backups + heartbeat alerts + daily briefing + MinIO sync)?', + false, + ); + if (enableOperatorPack) { + const config = builder.build(); + const telegramPeer = config.telegram?.allowed_chat_ids?.[0]; + const defaultOutputChannel = telegramPeer ? 'telegram' : 'webchat'; + const defaultOutputPeer = telegramPeer ? String(telegramPeer) : 'operator'; + + const backupSchedule = await p.ask('Backup cron schedule', '0 2 * * *'); + const dailyBriefingSchedule = await p.ask('Daily briefing cron schedule', '0 8 * * *'); + const enableMinioSync = await p.confirm('Include default MinIO sync task?', true); + + builder.applyOperatorPack({ + outputChannel: defaultOutputChannel, + outputPeer: defaultOutputPeer, + backupSchedule, + dailyBriefingSchedule, + enableMinioSync, + }); + p.println(`✓ Operator pack enabled (alerts routed to ${defaultOutputChannel}/${defaultOutputPeer})`); + } + const cron = await p.confirm('Enable cron scheduler?', false); if (cron) { builder.setCronEnabled(); diff --git a/src/cli/setup/config.test.ts b/src/cli/setup/config.test.ts index 84b0768..23ba60d 100644 --- a/src/cli/setup/config.test.ts +++ b/src/cli/setup/config.test.ts @@ -84,4 +84,23 @@ describe('ConfigBuilder', () => { const obj = builder.build(); expect(obj.server.token).toBe('my-secret-token'); }); + + it('applies operator automation pack defaults', () => { + const builder = new ConfigBuilder(); + builder.applyOperatorPack({ + outputChannel: 'telegram', + outputPeer: '123', + backupSchedule: '0 2 * * *', + dailyBriefingSchedule: '0 8 * * *', + enableMinioSync: true, + }); + + const obj = builder.build(); + expect(obj.backup?.enabled).toBe(true); + expect(obj.backup?.schedule).toBe('0 2 * * *'); + expect(obj.backup?.run_on_start).toBe(true); + expect((obj.automation as Record)?.heartbeat).toBeDefined(); + expect((obj.automation as Record)?.daily_briefing).toBeDefined(); + expect((obj.automation as Record)?.minio_sync).toBeDefined(); + }); }); diff --git a/src/cli/setup/config.ts b/src/cli/setup/config.ts index ddb37cb..00a4496 100644 --- a/src/cli/setup/config.ts +++ b/src/cli/setup/config.ts @@ -43,9 +43,23 @@ export interface SetupConfig { gtasks?: { enabled?: boolean }; heartbeat?: { enabled?: boolean }; } & Record; + backup?: { + enabled?: boolean; + schedule?: string; + run_on_start?: boolean; + notify?: { channel: string; peer: string }; + } & Record; [key: string]: unknown; } +interface OperatorPackOptions { + outputChannel: string; + outputPeer: string; + backupSchedule: string; + dailyBriefingSchedule: string; + enableMinioSync?: boolean; +} + export class ConfigBuilder { private config: SetupConfig; @@ -187,6 +201,54 @@ export class ConfigBuilder { this.config.automation = automation; } + applyOperatorPack(options: OperatorPackOptions): void { + const automation = (this.config.automation ?? {}) as Record; + const backup = (this.config.backup ?? {}) as Record; + + backup.enabled = true; + backup.schedule = options.backupSchedule; + backup.run_on_start = true; + backup.notify = { channel: options.outputChannel, peer: options.outputPeer }; + + automation.heartbeat = { + enabled: true, + notify: { channel: options.outputChannel, peer: options.outputPeer }, + interval: '5m', + failure_threshold: 2, + notify_cooldown: '30m', + }; + + automation.daily_briefing = { + enabled: true, + schedule: options.dailyBriefingSchedule, + output: { channel: options.outputChannel, peer: options.outputPeer }, + dedupe_per_local_day: true, + model_tier: 'fast', + }; + + if (options.enableMinioSync ?? true) { + automation.minio_sync = { + enabled: true, + interval: '6h', + run_on_start: true, + notify: { channel: options.outputChannel, peer: options.outputPeer }, + tasks: [ + { + prefix: 'knowledge/', + namespace_base: 'global/knowledge/minio', + mode: 'append', + max_objects: 20, + max_chars_per_object: 8000, + force: false, + }, + ], + }; + } + + this.config.automation = automation; + this.config.backup = backup; + } + build(): SetupConfig { return structuredClone(this.config) as SetupConfig; } diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts index e9085c0..772e9f4 100644 --- a/src/config/schema.test.ts +++ b/src/config/schema.test.ts @@ -1002,6 +1002,7 @@ describe('configSchema automation', () => { it('defaults heartbeat extended thresholds and checks', () => { const result = configSchema.parse(baseConfig); + expect(result.automation.heartbeat.notify_cooldown).toBe('30m'); expect(result.automation.heartbeat.process_memory_threshold_mb).toBe(1500); expect(result.automation.heartbeat.backup_failure_threshold).toBe(1); expect(result.automation.heartbeat.provider_error_rate_threshold).toBe(0.5); diff --git a/src/config/schema.ts b/src/config/schema.ts index 0836e76..0c5c068 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -308,6 +308,7 @@ const heartbeatCheckSchema = z.enum(['gateway', 'model', 'channels', 'memory', ' const heartbeatSchema = z.object({ enabled: z.boolean().default(false), interval: z.string().default('5m'), + notify_cooldown: z.string().default('30m'), checks: z.array(heartbeatCheckSchema).default(['gateway', 'model', 'channels', 'memory', 'disk', 'process_memory', 'backup', 'provider_errors']), notify: z.object({ channel: z.string().min(1),