From 787dd61a6d5b75571bee988d72ec62b98b1059e4 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Wed, 25 Feb 2026 11:23:47 -0800 Subject: [PATCH] docs(rollout): add phase 4 readiness plan --- ...02-25-phase4-rollout-operator-readiness.md | 73 +++++++++++++++++++ docs/plans/state.json | 11 +++ 2 files changed, 84 insertions(+) create mode 100644 docs/plans/2026-02-25-phase4-rollout-operator-readiness.md diff --git a/docs/plans/2026-02-25-phase4-rollout-operator-readiness.md b/docs/plans/2026-02-25-phase4-rollout-operator-readiness.md new file mode 100644 index 0000000..d46a959 --- /dev/null +++ b/docs/plans/2026-02-25-phase4-rollout-operator-readiness.md @@ -0,0 +1,73 @@ +# Phase 4 Rollout + Operator Readiness (Deeper Surfaces) + +Date: 2026-02-25 + +## Summary + +This document provides the rollout plan, rollback playbook, and operator readiness checklist for the deeper end-user surfaces + integrated behavior stack workstreams (run-control, reactions v2, companion/canvas/voice). + +## Canary Rollout Plan + +### Guarded Rollout Steps + +1. **Run-control semantics (Phase 1)** +Toggle: `server.queue.mode: interrupt` only for canary sessions via `server.queue.overrides.sessions`. +Gate: `cancel-to-ack p95 <= 500ms`, zero duplicate final responses in integration tests. +Observe: `run_state` events (`start`, `cancel_requested`, `cancelled`, `complete`, `error`) in gateway UI + audit logs. + +2. **Reactions v2 (Phase 2)** +Toggle: restrict `automation.reactions` list to canary rules + scoped triggers. +Gate: reaction false-positive rate <= 3% in audit logs (`reactionMatch`, `reactionSkip`). +Observe: `system.metrics` reaction counters + recursion guard skip reasons. + +3. **Companion + Canvas (Phase 3)** +Toggle: `server.nodes.enabled: true` for companion canary nodes, enable `server.nodes.feature_gates.ui.canvas`. +Gate: companion reconnect success >= 99% in soak; canvas artifacts survive restart in integration runs. 
+Observe: node registration + capability logs; canvas list/get/put success in gateway UI. + +4. **Voice Continuity (Phase 3)** +Toggle: `tts.enabled: true` and `tts.enabled_channels` for canary channels; `audio.enabled: true` for inbound voice. +Gate: no dropped responses when TTS fails; text-only fallback confirmed in tests. +Observe: warning logs for TTS failures, reply delivery counts. + +### Rollout Cadence + +1. Week 1: enable canary on a single internal channel + 1-2 sessions. +2. Week 2: expand to 5-10% sessions/channels after gates hold. +3. Week 3: expand to 25-50% after second gate review. +4. Week 4: default-on unless gates fail; keep toggles for rollback. + +## Rollback Playbook + +1. **Run-control rollback** +Set `server.queue.mode: collect` globally. +Remove canary overrides in `server.queue.overrides.sessions`. + +2. **Reactions rollback** +Set `automation.reactions: []` or remove canary rules. +Verify `reactionMatch` count drops to zero. + +3. **Companion rollback** +Set `server.nodes.enabled: false` (or restrict `allowed_roles` to none). +Clear companion node registrations by restarting gateway. + +4. **Canvas rollback** +Disable `ui.canvas` in `server.nodes.feature_gates`. +Optional: archive/remove `dataDir/canvas` after capture if needed. + +5. **Voice rollback** +Set `tts.enabled: false` and/or remove `tts.enabled_channels`. +Set `audio.enabled: false` to stop inbound voice processing. + +## Operator Readiness Checklist + +Confirm protocol and architecture docs are synchronized (`docs/api/PROTOCOL.md`, `docs/architecture/AGENT_DIAGRAM.md`, `docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md`). +Verify audit logs and `system.metrics` are capturing `run_state` transitions, cancel latency buckets, and reaction match/skip reasons. +Validate canary tests: run-control queue preemption + cancel, reaction priority/cooldown, companion reconnect + re-register, canvas persistence across restart, TTS failure fallback. 
+Capture a before/after snapshot of error rate, cancellation latency, reaction false positives, and companion reconnect success. + +## Owner + Comms + +- Primary owner: Flynn core team +- Canary checkpoint cadence: weekly +- Escalation: revert via rollback playbook within 1 hour of gate breach diff --git a/docs/plans/state.json b/docs/plans/state.json index 3bf5bbb..f5a9cd4 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -6751,6 +6751,17 @@ "docs/plans/state.json" ], "test_status": "pnpm test:run src/daemon/routing.test.ts passing" + }, + "deeper-surfaces-phase4-rollout-readiness": { + "status": "completed", + "date": "2026-02-25", + "updated": "2026-02-25", + "summary": "Documented Phase 4 rollout plan, rollback playbook, and operator readiness checklist for deeper surfaces and behavior stack changes.", + "files_modified": [ + "docs/plans/2026-02-25-phase4-rollout-operator-readiness.md", + "docs/plans/state.json" + ], + "test_status": "docs only" } }, "overall_progress": {