diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md index 7eaf6bf..7afc9a0 100644 --- a/.planning/REQUIREMENTS.md +++ b/.planning/REQUIREMENTS.md @@ -9,28 +9,28 @@ Requirements for this milestone. Each maps to roadmap phases. ### Daemon Decomposition -- [ ] **DECO-01**: Model client creation logic extracted from daemon/index.ts into src/daemon/models.ts with the same public interface -- [ ] **DECO-02**: Channel adapter setup logic extracted into src/daemon/channels.ts -- [ ] **DECO-03**: Agent cache and factory logic extracted into src/daemon/agents.ts -- [ ] **DECO-04**: Memory store and vector store initialization extracted into src/daemon/memory.ts -- [ ] **DECO-05**: Tool registration and policy wiring extracted into src/daemon/tools.ts -- [ ] **DECO-06**: Message routing logic extracted into src/daemon/routing.ts (test file already exists) -- [ ] **DECO-07**: daemon/index.ts reduced to a thin composition root that imports and wires extracted modules -- [ ] **DECO-08**: All 1077+ existing tests continue to pass after decomposition +- [x] **DECO-01**: Model client creation logic extracted from daemon/index.ts into src/daemon/models.ts with the same public interface +- [x] **DECO-02**: Channel adapter setup logic extracted into src/daemon/channels.ts +- [x] **DECO-03**: Agent cache and factory logic extracted into src/daemon/agents.ts +- [x] **DECO-04**: Memory store and vector store initialization extracted into src/daemon/memory.ts +- [x] **DECO-05**: Tool registration and policy wiring extracted into src/daemon/tools.ts +- [x] **DECO-06**: Message routing logic extracted into src/daemon/routing.ts (test file already exists) +- [x] **DECO-07**: daemon/index.ts reduced to a thin composition root that imports and wires extracted modules +- [x] **DECO-08**: All 1077+ existing tests continue to pass after decomposition ### Config Overlays -- [ ] **CONF-01**: User can set FLYNN_ENV environment variable to select a config overlay (e.g., docker, production) -- [ ] **CONF-02**: Config loader merges environment-specific overlay file on top of base config with deep merge -- [ ] **CONF-03**: flynn doctor validates that the selected environment overlay file exists when FLYNN_ENV is set +- [x] **CONF-01**: User can set FLYNN_ENV environment variable to select a config overlay (e.g., docker, production) +- [x] **CONF-02**: Config loader merges environment-specific overlay file on top of base config with deep merge +- [x] **CONF-03**: flynn doctor validates that the selected environment overlay file exists when FLYNN_ENV is set ### Live Ops Dashboard -- [ ] **DASH-01**: Dashboard shows core counters: messages processed, active sessions, queue depth, daemon uptime -- [ ] **DASH-02**: Dashboard shows model call metrics: per-call latency, tokens/sec throughput, error rates by provider -- [ ] **DASH-03**: Dashboard shows live event stream: scrollable log of errors and events with timestamps and context -- [ ] **DASH-04**: Dashboard shows active request tracking: in-flight requests, recent tool executions, active agent sessions -- [ ] **DASH-05**: Gateway exposes /health endpoint returning JSON status for liveness/readiness checks +- [x] **DASH-01**: Dashboard shows core counters: messages processed, active sessions, queue depth, daemon uptime +- [x] **DASH-02**: Dashboard shows model call metrics: per-call latency, tokens/sec throughput, error rates by provider +- [x] **DASH-03**: Dashboard shows live event stream: scrollable log of errors and events with timestamps and context +- [x] **DASH-04**: Dashboard shows active request tracking: in-flight requests, recent tool executions, active agent sessions +- [x] **DASH-05**: Gateway exposes /health endpoint returning JSON status for liveness/readiness checks ## v2 Requirements @@ -71,22 +71,22 @@ Which phases cover which requirements. Updated during roadmap creation. | Requirement | Phase | Status | |-------------|-------|--------| -| DECO-01 | Phase 1 | Pending | -| DECO-02 | Phase 1 | Pending | -| DECO-03 | Phase 1 | Pending | -| DECO-04 | Phase 1 | Pending | -| DECO-05 | Phase 1 | Pending | -| DECO-06 | Phase 1 | Pending | -| DECO-07 | Phase 1 | Pending | -| DECO-08 | Phase 1 | Pending | -| CONF-01 | Phase 2 | Pending | -| CONF-02 | Phase 2 | Pending | -| CONF-03 | Phase 2 | Pending | -| DASH-01 | Phase 3 | Pending | -| DASH-02 | Phase 3 | Pending | -| DASH-03 | Phase 3 | Pending | -| DASH-04 | Phase 3 | Pending | -| DASH-05 | Phase 3 | Pending | +| DECO-01 | Phase 1 | Complete | +| DECO-02 | Phase 1 | Complete | +| DECO-03 | Phase 1 | Complete | +| DECO-04 | Phase 1 | Complete | +| DECO-05 | Phase 1 | Complete | +| DECO-06 | Phase 1 | Complete | +| DECO-07 | Phase 1 | Complete | +| DECO-08 | Phase 1 | Complete | +| CONF-01 | Phase 2 | Complete | +| CONF-02 | Phase 2 | Complete | +| CONF-03 | Phase 2 | Complete | +| DASH-01 | Phase 3 | Complete | +| DASH-02 | Phase 3 | Complete | +| DASH-03 | Phase 3 | Complete | +| DASH-04 | Phase 3 | Complete | +| DASH-05 | Phase 3 | Complete | **Coverage:** - v1 requirements: 16 total @@ -95,4 +95,4 @@ Which phases cover which requirements. Updated during roadmap creation. --- *Requirements defined: 2026-02-09* -*Last updated: 2026-02-09 after initial definition* +*Last updated: 2026-02-13 after Phase 3 completion* diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index b0471c1..1c5606e 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -67,8 +67,8 @@ Plans: **Plans:** 2 plans in 2 waves Plans: -- [ ] 03-01-PLAN.md — Backend metrics collector, RPC handlers, HTTP /health endpoint -- [ ] 03-02-PLAN.md — Dashboard UI with live counters, model metrics, event stream, active requests +- [x] 03-01-PLAN.md — Backend metrics collector, RPC handlers, HTTP /health endpoint +- [x] 03-02-PLAN.md — Dashboard UI with live counters, model metrics, event stream, active requests | Plan | Wave | Objective | Tasks | |------|------|-----------|-------| @@ -87,10 +87,10 @@ Plans: |-------|--------|--------------| | 1 — Daemon Decomposition | **complete** | DECO-01..08 (8) — 3 plans, 2 waves | | 2 — Config Overlays | **complete** | CONF-01..03 (3) — 2 plans, 2 waves | -| 3 — Live Ops Dashboard | not_started | DASH-01..05 (5) | +| 3 — Live Ops Dashboard | **complete** | DASH-01..05 (5) — 2 plans, 2 waves | **Coverage:** 16/16 v1 requirements mapped ✓ --- *Roadmap created: 2026-02-09* -*Last updated: 2026-02-10* +*Last updated: 2026-02-13* diff --git a/.planning/STATE.md b/.planning/STATE.md index 20fddf8..cc377f3 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -9,9 +9,9 @@ ## Current Position **Phase:** 3 — Live Ops Dashboard -**Plan:** 1 of 2 complete (03-01 done) -**Status:** in_progress -**Progress:** ██████████ 2.5/3 phases (Phase 3: 1/2 plans) +**Plan:** 2 of 2 complete (03-01 and 03-02 done) +**Status:** complete +**Progress:** ██████████ 3/3 phases (Phase 3: 2/2 plans) ## Phase Status @@ -19,13 +19,13 @@ |-------|--------|-------| | 1 — Daemon Decomposition | **complete** | 3/3 plans complete | | 2 — Config Overlays | **complete** | 2/2 plans complete | -| 3 — Live Ops Dashboard | **in_progress** | 1/2 plans complete | +| 3 — Live Ops Dashboard | **complete** | 2/2 plans complete | ## Performance Metrics | Metric | Value | |--------|-------| -| Test count | 1107 (verified after 03-01, +20 metrics tests from 1087 baseline) | +| Test count | 1597 (verified after runtime-cancellation follow-up) | | daemon/index.ts lines | 140 (from 1087 baseline, -87%) | | Total daemon modules | 9 files, 1271 lines | | Plan 01-01 duration | 9 min | @@ -40,6 +40,8 @@ | Plan 02-02 tasks | 1/1 | | Plan 03-01 duration | ~2 min | | Plan 03-01 tasks | 2/2 | +| Plan 03-02 status | implemented and verified with typecheck/build/test; summary backfilled | +| Plan 03-02 tasks | 2/2 | ## Accumulated Context @@ -91,10 +93,10 @@ _(none)_ ## Session Continuity -**Last session:** Plan 03-01 (metrics collection backend) completed -**Stopped at:** Completed 03-01-PLAN.md — Phase 3 plan 1 of 2 done -**Next action:** Execute 03-02-PLAN.md (Dashboard UI) +**Last session:** Phase 3 closure and dashboard verification run +**Stopped at:** Completed 03-02 plan summary and roadmap/requirements status sync +**Next action:** Start next milestone or pick a new planning phase --- *State initialized: 2026-02-09* -*Last updated: 2026-02-10T05:29Z* +*Last updated: 2026-02-13T08:20Z* diff --git a/.planning/phases/03-live-ops-dashboard/03-02-SUMMARY.md b/.planning/phases/03-live-ops-dashboard/03-02-SUMMARY.md new file mode 100644 index 0000000..085c917 --- /dev/null +++ b/.planning/phases/03-live-ops-dashboard/03-02-SUMMARY.md @@ -0,0 +1,96 @@ +--- +phase: 03-live-ops-dashboard +plan: 02 +subsystem: gateway-ui +tags: [dashboard, ui, metrics, events, active-requests, monitoring] + +# Dependency graph +requires: + - phase: 03-live-ops-dashboard + provides: "MetricsCollector, system.metrics/system.events/system.activeRequests RPC handlers, and /health endpoint" +provides: + - "Live Ops dashboard UI sections for counters, model performance, event stream, active requests, and channels" + - "Dual refresh cadence: fast (3s) metrics/events/requests and slow (10s) health/channels" + - "Event stream styling and model metrics summary styling" +affects: [operator-observability, milestone-closure] + +# Tech tracking +tech-stack: + added: [] + patterns: + - "Targeted section updates via stable DOM IDs" + - "Split polling cadence for high-churn vs low-churn data" + +key-files: + created: [] + modified: + - src/gateway/ui/pages/dashboard.js + - src/gateway/ui/style.css + +key-decisions: + - "Keep vanilla JS page module pattern (render/teardown), no framework migration" + - "Use two polling timers (3s and 10s) to reduce unnecessary RPC load" + - "Render newest events at the bottom with auto-scroll for log readability" + +patterns-established: + - "Dashboard section IDs as update boundaries: ops-counters, ops-model-table, ops-events, ops-requests, ops-channels" + +# Metrics +duration: unknown (implementation commit predates this summary backfill) +completed: 2026-02-13 +--- + +# Phase 3 Plan 2: Live Ops Dashboard UI Summary + +**Extended the existing dashboard with live counters, model performance telemetry, event stream, and active request visibility, backed by Phase 3 Plan 1 RPC endpoints.** + +## Performance + +- **Summary date:** 2026-02-13 +- **Tasks:** 2/2 (implementation + closure/verification) +- **Files modified:** 2 + +## Accomplishments +- Implemented dashboard sections for core counters, model metrics table, event stream, active requests, and channels +- Wired RPC calls to `system.metrics`, `system.events`, `system.activeRequests`, `system.health`, and `system.channels` +- Added 3-second fast refresh for dynamic ops data and 10-second slow refresh for health/channel state +- Added event stream and model summary styling in shared gateway UI stylesheet + +## Task Commits + +Implementation was already present in commit history and is now formally closed with planning artifacts: + +1. **Task 1: Extend dashboard page with live ops sections** - `c3ca3f3` (feat) +2. **Follow-up style cleanup** - `6090508` (style) + +## Files Created/Modified +- `src/gateway/ui/pages/dashboard.js` - Live ops dashboard structure, polling, and targeted section updates +- `src/gateway/ui/style.css` - Event stream and model metrics summary styles + +## Verification + +Automated checks run during this closure: + +- `pnpm typecheck` ✅ +- `pnpm build` ✅ +- `pnpm test:run` ✅ (1590/1590 passed) + +Manual browser verification (visual sanity check) remains recommended as a final operator check. + +## Deviations from Plan + +No functional deviations. This summary was backfilled after implementation had already landed. + +## Issues Encountered +None + +## User Setup Required +None + +## Next Phase Readiness +- Phase 3 is complete from an implementation and automated validation perspective +- Milestone artifacts are now synchronized (`STATE.md`, `ROADMAP.md`, `REQUIREMENTS.md`) + +--- +*Phase: 03-live-ops-dashboard* +*Completed: 2026-02-13* diff --git a/docs/plans/phase2-pr3-history-index-checklist.md b/docs/plans/phase2-pr3-history-index-checklist.md index eeef77b..0a697bd 100644 --- a/docs/plans/phase2-pr3-history-index-checklist.md +++ b/docs/plans/phase2-pr3-history-index-checklist.md @@ -2,7 +2,7 @@ Created: 2026-02-12 Owner: Flynn core -Status: ready to implement +Status: completed ## Goal @@ -86,3 +86,12 @@ pnpm build ## Commit Message `feat(session): add history indexing and topic search metadata` + +## Completion Notes (2026-02-13) + +- Implemented `history_index` config with defaults and bounds. +- Added migration-safe message metadata persistence in SQLite. +- Implemented indexing/tokenization and ranked history search with recency weighting. +- Wired indexing/search lifecycle in `SessionManager` and routing boost hook in daemon routing. +- Added gateway handlers for `history.search` and `history.reindex`. +- Verified with full suite: `pnpm test:run` (`1593/1593`), plus `pnpm typecheck` and `pnpm build`. diff --git a/docs/plans/state.json b/docs/plans/state.json index f1453f1..8235573 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -1146,6 +1146,35 @@ ], "test_status": "typecheck + targeted policy/intents/routing tests + full test suite + build passing; lint currently fails due pre-existing unrelated repo issues" }, + "remaining-phases-phase2-pr3-history-index": { + "file": "phase2-pr3-history-index-checklist.md", + "status": "completed", + "date": "2026-02-13", + "summary": "Added lightweight session history indexing and topic search with migration-safe metadata persistence, ranked keyword search + recency scoring, gateway search/reindex handlers, and optional routing confidence boost from historical overlap.", + "files_created": [ + "src/session/indexer.ts", + "src/session/search.ts", + "src/session/indexer.test.ts", + "src/session/search.test.ts", + "src/gateway/handlers/history.ts" + ], + "files_modified": [ + "src/config/schema.ts", + "src/config/schema.test.ts", + "src/session/store.ts", + "src/session/store.test.ts", + "src/session/manager.ts", + "src/session/manager.test.ts", + "src/session/index.ts", + "src/daemon/index.ts", + "src/daemon/routing.ts", + "src/daemon/routing.test.ts", + "src/gateway/handlers/index.ts", + "src/gateway/handlers/handlers.test.ts", + "src/gateway/server.ts" + ], + "test_status": "pnpm typecheck + pnpm test:run (1593/1593) + pnpm build passing" + }, "remaining-phases-phase3-pr1-adaptive-memory-compaction": { "file": "phase3-pr1-adaptive-memory-compaction-checklist.md", "status": "completed", @@ -1240,9 +1269,24 @@ ], "test_status": "pnpm typecheck + pnpm test:run (1586/1586) + pnpm build passing" }, + "gateway-agent-cancel-runtime": { + "status": "completed", + "date": "2026-02-13", + "summary": "Implemented real runtime cancellation wiring for `agent.cancel`: active requests are now cancellable at safe points in `NativeAgent`, queued lane work is cleared, and gateway/session bridge cancellation paths return explicit status messages.", + "files_modified": [ + "src/backends/native/agent.ts", + "src/backends/native/agent.test.ts", + "src/backends/native/orchestrator.ts", + "src/gateway/session-bridge.ts", + "src/gateway/session-bridge.test.ts", + "src/gateway/handlers/agent.ts", + "src/gateway/handlers/handlers.test.ts" + ], + "test_status": "pnpm typecheck + pnpm test:run (1597/1597) + pnpm build passing" + }, "skills_infrastructure": { "file": "2026-02-11-skills-infrastructure-plan.md", - "status": "planned", + "status": "completed", "date": "2026-02-11", "summary": "Three-phase plan to improve skills system: Command Dispatch (P0), Skills Watcher (P1), Installer Specs (P1). Infrastructure-first approach before integrating ClawHub skills. Estimated 8-11 hours total. Model strategy: glm-4.7-flash for mechanical tasks, glm-4.7 for complex/orchestration tasks.", "phases": { @@ -1314,7 +1358,7 @@ }, "phase_2_skills_watcher": { "priority": "P1", - "status": "in_progress", + "status": "completed", "description": "Auto-reload skills with chokidar file watcher, configurable debounce", "effort": "3-4 hours", "sub_slices": { @@ -1378,7 +1422,7 @@ }, "phase_3_installer_specs": { "priority": "P1", - "status": "in_progress", + "status": "completed", "description": "Auto-install dependencies (brew/node/go/download) with package manager detection", "effort": "3-4 hours", "sub_slices": { @@ -1670,6 +1714,15 @@ "src/cli/skills.test.ts" ], "test_status": "pnpm typecheck + pnpm test:run src/cli/skills.test.ts + pnpm test:run + pnpm lint (warnings only, 0 errors) + pnpm build passing" + }, + "shell_runner_promotion_contract_output": { + "status": "completed", + "description": "Added dedicated machine-readable promotion contract output for `skills rollout-status` (`--contract`) with stable schema, CI-friendly gate/exit code semantics, and optional `--out` export support", + "files_modified": [ + "src/cli/skills.ts", + "src/cli/skills.test.ts" + ], + "test_status": "pnpm test:run src/cli/skills.test.ts + pnpm typecheck + pnpm build passing" } } } @@ -1698,7 +1751,7 @@ }, "overall_progress": { - "total_test_count": 1586, + "total_test_count": 1597, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -1714,11 +1767,11 @@ "tier3_completion": "5/5 (100%) — lane queue, credential redaction, web UI token dashboard, xAI (Grok) provider, Voyage AI embeddings", "tier4_completion": "4/4 (100%) — gateway lock, shell completion, Tailscale Serve/Funnel, DM pairing codes", "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)", - "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next", + "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 2/2 plans complete — milestone done", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback", - "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 2/2 (100%) — component registry, confidence routing. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening", - "next_up": "Skills infrastructure follow-up: expose promotion-policy status as a dedicated machine-readable contract for automation consumers (e.g., CI gate or dashboard ingest) before broader shell-runner rollout" + "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening", + "next_up": "Define next milestone and create a new implementation checklist (all remaining-phases PR slices complete)" }, "soul_md_and_cron_create": { "date": "2026-02-11", diff --git a/src/cli/skills.test.ts b/src/cli/skills.test.ts index 45fb950..f5deb5d 100644 --- a/src/cli/skills.test.ts +++ b/src/cli/skills.test.ts @@ -35,6 +35,7 @@ import { recommendShellRunnerRolloutPhase, sanitizeSkillInstallerAuditReason, summarizeShellRunnerAuditWindow, + toShellRunnerPromotionContract, resolveSkillInstallerCommandRunner, runSkillExecuteAction, runSkillInstallAction, @@ -753,6 +754,70 @@ describe('skills CLI helpers', () => { expect(policy.blockers).toContain('failures increased by 1 vs previous window'); }); + it('builds machine-readable promotion contract with gate status and blockers', () => { + const contract = toShellRunnerPromotionContract({ + generatedAt: '2026-02-13T00:00:00.000Z', + days: 7, + recommendation: 'guarded_review', + guardrails: { blockers: ['skills.installation_execution must be enabled'] }, + summary: { + command_result_total: 4, + command_result_failed: 1, + allowlist_blocked: 0, + execution_blocked: 0, + hashed_command_count: 3, + unhashed_command_count: 1, + }, + trend: { + current: { + command_result_total: 4, + command_result_failed: 1, + allowlist_blocked: 0, + execution_blocked: 0, + hashed_command_count: 3, + unhashed_command_count: 1, + }, + previous: { + command_result_total: 4, + command_result_failed: 0, + allowlist_blocked: 0, + execution_blocked: 0, + hashed_command_count: 4, + unhashed_command_count: 0, + }, + deltas: { + failures: 1, + allowlist_blocks: 0, + hash_coverage_pct: -25, + }, + }, + promotionPolicy: { + eligible: false, + recommendation: 'not_eligible', + cadence_days: 7, + reviewed_window_days: 7, + success_rate: 0.75, + minimum_success_rate: 0.9, + failures_delta: 1, + allowlist_blocks_delta: 0, + hash_coverage_delta_pct: -25, + blockers: ['success rate 75.00% below minimum 90.00%'], + }, + governance: { + owner: 'skills-team', + review_cadence_days: 7, + promotion_min_success_rate: 0.9, + }, + }); + + expect(contract.schema).toBe('skills.rollout.promotion_contract.v1'); + expect(contract.gate.status).toBe('fail'); + expect(contract.gate.exit_code).toBe(1); + expect(contract.gate.blockers).toContain('skills.installation_execution must be enabled'); + expect(contract.gate.blockers).toContain('success rate 75.00% below minimum 90.00%'); + expect(contract.summary.hash_coverage_pct).toBe(75); + }); + it('marks promotion policy eligible when thresholds and trends are healthy', () => { const policy = evaluateShellRunnerPromotionPolicy({ trend: { @@ -2161,6 +2226,92 @@ describe('skills CLI helpers', () => { rmSync(root, { recursive: true, force: true }); }); + it('skills rollout-status emits dedicated promotion contract JSON with exit code', async () => { + const root = mkdtempSync(join(tmpdir(), 'flynn-skills-cli-')); + const configPath = join(root, 'config.yaml'); + const managedDir = join(root, 'managed'); + const bundledDir = join(root, 'bundled'); + const workspaceDir = join(root, 'workspace'); + const auditPath = join(root, 'audit.log'); + mkdirSync(managedDir, { recursive: true }); + mkdirSync(bundledDir, { recursive: true }); + mkdirSync(workspaceDir, { recursive: true }); + writeFileSync(auditPath, '', 'utf-8'); + writeSkillsCliConfig(configPath, { + managedDir, + bundledDir, + workspaceDir, + installationExecution: 'enabled', + allowShellRunner: true, + shellRunnerAllowlist: ['npm install*'], + shellRunnerGovernanceOwner: 'skills-team', + auditPath, + }); + + const program = new Command(); + registerSkillsCommand(program); + + const logSpy = vi.spyOn(console, 'log').mockImplementation(() => undefined); + process.exitCode = undefined; + + await program.parseAsync(['skills', 'rollout-status', '--contract', '-c', configPath], { from: 'user' }); + + const payload = JSON.parse(String(logSpy.mock.calls[0]?.[0])); + expect(payload.schema).toBe('skills.rollout.promotion_contract.v1'); + expect(payload.gate.status).toBe('fail'); + expect(payload.gate.exit_code).toBe(1); + expect(payload.governance.owner).toBe('skills-team'); + expect(process.exitCode).toBe(1); + + logSpy.mockRestore(); + process.exitCode = undefined; + rmSync(root, { recursive: true, force: true }); + }); + + it('skills rollout-status writes dedicated promotion contract to output file', async () => { + const root = mkdtempSync(join(tmpdir(), 'flynn-skills-cli-')); + const configPath = join(root, 'config.yaml'); + const managedDir = join(root, 'managed'); + const bundledDir = join(root, 'bundled'); + const workspaceDir = join(root, 'workspace'); + const auditPath = join(root, 'audit.log'); + const outputPath = join(root, 'rollout-contract.json'); + mkdirSync(managedDir, { recursive: true }); + mkdirSync(bundledDir, { recursive: true }); + mkdirSync(workspaceDir, { recursive: true }); + writeFileSync(auditPath, '', 'utf-8'); + writeSkillsCliConfig(configPath, { + managedDir, + bundledDir, + workspaceDir, + installationExecution: 'enabled', + allowShellRunner: true, + shellRunnerAllowlist: ['npm install*'], + shellRunnerGovernanceOwner: 'skills-team', + auditPath, + }); + + const program = new Command(); + registerSkillsCommand(program); + + const logSpy = vi.spyOn(console, 'log').mockImplementation(() => undefined); + process.exitCode = undefined; + + await program.parseAsync(['skills', 'rollout-status', '--contract', '--out', outputPath, '-c', configPath], { + from: 'user', + }); + + expect(existsSync(outputPath)).toBe(true); + const payload = JSON.parse(readFileSync(outputPath, 'utf-8')); + expect(payload.schema).toBe('skills.rollout.promotion_contract.v1'); + expect(payload.gate).toBeDefined(); + expect(payload.summary).toBeDefined(); + + logSpy.mockRestore(); + process.exitCode = undefined; + rmSync(root, { recursive: true, force: true }); + }); + it('skills rollout-status includes trend deltas across adjacent windows', async () => { const root = mkdtempSync(join(tmpdir(), 'flynn-skills-cli-')); const configPath = join(root, 'config.yaml'); diff --git a/src/cli/skills.ts b/src/cli/skills.ts index d84a788..674b78a 100644 --- a/src/cli/skills.ts +++ b/src/cli/skills.ts @@ -132,6 +132,32 @@ export interface ShellRunnerPromotionPolicyStatus { blockers: string[]; } +export interface ShellRunnerPromotionContract { + schema: 'skills.rollout.promotion_contract.v1'; + generated_at: string; + window_days: number; + gate: { + status: 'pass' | 'fail'; + exit_code: 0 | 1; + reason: 'promotion_eligible' | 'promotion_not_eligible'; + blockers: string[]; + }; + recommendation: ShellRunnerRolloutRecommendation; + governance: { + owner: string | null; + review_cadence_days: number; + promotion_min_success_rate: number; + }; + summary: { + command_result_total: number; + command_result_failed: number; + allowlist_blocked: number; + hash_coverage_pct: number; + }; + promotion_policy: ShellRunnerPromotionPolicyStatus; + trend: ShellRunnerAuditTrendSnapshot['deltas']; +} + export type ShellRunnerRolloutRecommendation = 'locked' | 'guarded_observe' | 'guarded_review' | 'expand_candidate'; export function evaluateShellRunnerRolloutGuardrails( @@ -321,6 +347,50 @@ export function recommendShellRunnerRolloutPhase( return 'expand_candidate'; } +export function toShellRunnerPromotionContract(args: { + generatedAt: string; + days: number; + recommendation: ShellRunnerRolloutRecommendation; + guardrails: ShellRunnerRolloutGuardrailStatus; + summary: ShellRunnerAuditWindowSummary; + trend: ShellRunnerAuditTrendSnapshot; + promotionPolicy: ShellRunnerPromotionPolicyStatus; + governance: { + owner: string | null; + review_cadence_days: number; + promotion_min_success_rate: number; + }; +}): ShellRunnerPromotionContract { + const blockers = [...args.guardrails.blockers, ...args.promotionPolicy.blockers]; + const eligible = args.promotionPolicy.eligible && blockers.length === 0; + + return { + schema: 'skills.rollout.promotion_contract.v1', + generated_at: args.generatedAt, + window_days: args.days, + gate: { + status: eligible ? 'pass' : 'fail', + exit_code: eligible ? 0 : 1, + reason: eligible ? 'promotion_eligible' : 'promotion_not_eligible', + blockers, + }, + recommendation: args.recommendation, + governance: { + owner: args.governance.owner, + review_cadence_days: args.governance.review_cadence_days, + promotion_min_success_rate: args.governance.promotion_min_success_rate, + }, + summary: { + command_result_total: args.summary.command_result_total, + command_result_failed: args.summary.command_result_failed, + allowlist_blocked: args.summary.allowlist_blocked, + hash_coverage_pct: calculateShellRunnerHashCoveragePercent(args.summary), + }, + promotion_policy: args.promotionPolicy, + trend: args.trend.deltas, + }; +} + function expandHomePath(pathValue: string): string { if (pathValue.startsWith('~/')) { return resolve(homedir(), pathValue.slice(2)); @@ -1337,10 +1407,11 @@ export function registerSkillsCommand(program: Command): void { .command('rollout-status') .description('Show shell runner rollout guardrails and audit review summary') .option('--days ', 'Look back N days in audit logs (default: 7)', '7') + .option('--contract', 'Output dedicated machine-readable promotion contract JSON') .option('--out ', 'Write rollout JSON payload to file') .option('--json', 'Output as JSON') .option('-c, --config ', 'Config file path') - .action(async (opts: { days?: string; out?: string; json?: boolean; config?: string }) => { + .action(async (opts: { days?: string; contract?: boolean; out?: string; json?: boolean; config?: string }) => { const loaded = loadConfigSafe(opts.config); if (loaded.error || !loaded.config) { console.error(loaded.error ?? 'Failed to load config'); @@ -1380,7 +1451,9 @@ export function registerSkillsCommand(program: Command): void { promotion_min_success_rate: governance.promotion_min_success_rate, }, }); + const generatedAt = new Date(nowMs).toISOString(); const rolloutPayload = { + generated_at: generatedAt, days: parsedDays, guardrails, summary: trend.current, @@ -1393,9 +1466,33 @@ export function registerSkillsCommand(program: Command): void { promotion_min_success_rate: governance.promotion_min_success_rate, }, }; + const promotionContract = toShellRunnerPromotionContract({ + generatedAt, + days: parsedDays, + recommendation, + guardrails, + summary: trend.current, + trend, + promotionPolicy, + governance: { + owner: governance.owner ?? null, + review_cadence_days: governance.review_cadence_days, + promotion_min_success_rate: governance.promotion_min_success_rate, + }, + }); if (opts.out) { - writeFileSync(expandHomePath(opts.out), JSON.stringify(rolloutPayload, null, 2), 'utf-8'); + writeFileSync( + expandHomePath(opts.out), + JSON.stringify(opts.contract ? promotionContract : rolloutPayload, null, 2), + 'utf-8', + ); + } + + if (opts.contract) { + console.log(JSON.stringify(promotionContract, null, 2)); + process.exitCode = promotionContract.gate.exit_code; + return; } if (opts.json) {