From 9156adb2a8ee317ed5e93d9bcedb0a9c8a83b10f Mon Sep 17 00:00:00 2001 From: William Valentin Date: Mon, 23 Feb 2026 22:29:08 -0800 Subject: [PATCH] docs(eval): record pi canary window A results and hold decision --- .../pi_embedded_eval_window_a_2026-02-24.json | 126 ++++++++++++++++++ .../pi_embedded_eval_window_a_2026-02-24.md | 50 +++++++ docs/plans/pi_embedded_evaluation.md | 34 ++--- docs/plans/state.json | 8 +- 4 files changed, 200 insertions(+), 18 deletions(-) create mode 100644 docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json create mode 100644 docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md diff --git a/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json new file mode 100644 index 0000000..5f25605 --- /dev/null +++ b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json @@ -0,0 +1,126 @@ +{ + "generated_at": "2026-02-24T06:28:12.014Z", + "event_count": 45, + "filters": {}, + "options": { + "targetBackend": "pi_embedded", + "baselineBackend": "native", + "sessionIds": [ + "telegram:8367012007" + ] + }, + "summary": { + "route_stats": { + "total": 10, + "by_backend": { + "native": 2, + "pi_embedded": 8 + }, + "by_source": { + "agent_override": 10 + }, + "forced_native_guards": {} + }, + "target": { + "backend": "pi_embedded", + "routes": 8, + "completed_turns": 8, + "incomplete_turns": 0, + "completion_rate_pct": 100, + "e2e_latency_ms": { + "count": 8, + "avg_ms": 4615, + "p50_ms": 3240, + "p95_ms": 8776, + "min_ms": 1859, + "max_ms": 9381 + } + }, + "baseline": { + "backend": "native", + "routes": 2, + "completed_turns": 2, + "incomplete_turns": 0, + "completion_rate_pct": 100, + "e2e_latency_ms": { + "count": 2, + "avg_ms": 2981, + "p50_ms": 2981, + "p95_ms": 3081, + "min_ms": 2870, + "max_ms": 3092 + } + }, + "target_external_attempts": { + "attempts": 8, + "successes": 6, + "fallbacks": 2, + "unresolved_attempts": 0, + "success_rate_pct": 
75, + "attempt_latency_ms": { + "count": 8, + "avg_ms": 3961, + "p50_ms": 2636, + "p95_ms": 8766, + "min_ms": 135, + "max_ms": 9371 + } + }, + "comparison": { + "completion_rate_delta_pp": 0, + "p50_latency_delta_ms": 259, + "p95_latency_delta_ms": 5695 + }, + "fallback_categories": [ + { + "category": "loaded pi module does not expose a supported session factory (expected one of: c", + "count": 1, + "pct": 50 + }, + { + "category": "pi agent runtime produced no assistant text", + "count": 1, + "pct": 50 + } + ], + "fallback_top_reasons": [ + { + "reason": "Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge", + "count": 1 + }, + { + "reason": "Pi Agent runtime produced no assistant text", + "count": 1 + } + ] + }, + "gate": { + "pass": false, + "criteria": [ + { + "criterion": "Completion rate delta (target - baseline)", + "pass": true, + "actual": "0.00pp", + "threshold": ">= -2.00pp" + }, + { + "criterion": "P50 latency delta (target - baseline)", + "pass": false, + "actual": "259ms", + "threshold": "<= 250ms" + }, + { + "criterion": "P95 latency delta (target - baseline)", + "pass": false, + "actual": "5695ms", + "threshold": "<= 700ms" + }, + { + "criterion": "Fallback rate (target external attempts)", + "pass": false, + "actual": "25.00%", + "threshold": "<= 5.00%" + } + ] + } +} diff --git a/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md new file mode 100644 index 0000000..e7af055 --- /dev/null +++ b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md @@ -0,0 +1,50 @@ +# Pi Embedded Canary Summary + +- Target backend: `pi_embedded` +- Baseline backend: `native` +- Routes analyzed: 10 + +## Route Distribution + +| Backend | Routes | +| --- | ---: | +| pi_embedded | 8 | +| native | 2 | + +## Reliability + +| Metric | Target | Baseline | Delta | +| --- | ---: | ---: | ---: | +| Turn 
completion rate | 100.00% | 100.00% | 0.00pp | +| External success rate | 75.00% | n/a | n/a | +| External attempts | 8 | n/a | n/a | +| External fallbacks | 2 | n/a | n/a | + +## Latency + +- Target end-to-end: count=8, avg=4615ms, p50=3240ms, p95=8776ms, min=1859ms, max=9381ms +- Baseline end-to-end: count=2, avg=2981ms, p50=2981ms, p95=3081ms, min=2870ms, max=3092ms +- P50 delta (target - baseline): 259ms +- P95 delta (target - baseline): 5695ms +- Target external attempt: count=8, avg=3961ms, p50=2636ms, p95=8766ms, min=135ms, max=9371ms + +## Fallback Taxonomy + +| Category | Count | Percent | +| --- | ---: | ---: | +| loaded pi module does not expose a supported session factory (expected one of: c | 1 | 50.00% | +| pi agent runtime produced no assistant text | 1 | 50.00% | + +## Top Fallback Reasons + +- Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge (1) +- Pi Agent runtime produced no assistant text (1) + +## Gate Evaluation + +- Gate result: HOLD +- [x] Completion rate delta (target - baseline): actual=0.00pp, threshold=>= -2.00pp +- [ ] P50 latency delta (target - baseline): actual=259ms, threshold=<= 250ms +- [ ] P95 latency delta (target - baseline): actual=5695ms, threshold=<= 700ms +- [ ] Fallback rate (target external attempts): actual=25.00%, threshold=<= 5.00% + diff --git a/docs/plans/pi_embedded_evaluation.md b/docs/plans/pi_embedded_evaluation.md index 8499a32..4c93a5f 100644 --- a/docs/plans/pi_embedded_evaluation.md +++ b/docs/plans/pi_embedded_evaluation.md @@ -65,17 +65,19 @@ pnpm audit:backend-canary \ ### Window A -- Dates: _TBD_ -- Route volume: _TBD_ -- Summary artifact: _TBD_ +- Dates: February 24, 2026 (05:29:49Z to 06:26:20Z) +- Route volume: 10 total routes (`pi_embedded`: 8, `native`: 2) +- Summary artifacts: + - `docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md` + - 
`docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json` | Check | Result | Notes | | --- | --- | --- | -| Completion rate delta | _TBD_ | | -| P50 latency delta | _TBD_ | | -| P95 latency delta | _TBD_ | | -| Fallback rate | _TBD_ | | -| Guardrail escapes | _TBD_ | | +| Completion rate delta | 0.00pp (pass) | target 100.00% vs baseline 100.00% | +| P50 latency delta | +259ms (fail) | gate <= +250ms | +| P95 latency delta | +5695ms (fail) | gate <= +700ms | +| Fallback rate | 25.00% (fail) | 2 fallbacks / 8 attempts; gate <= 5.00% | +| Guardrail escapes | none observed (provisional pass) | no `forced_native_guard` events in this window | ### Window B @@ -97,16 +99,18 @@ Track all tool-adjacent/risky prompts that were force-routed to native (`no_tool | Class | Observed behavior | Action | | --- | --- | --- | -| Tool-adjacent prompts | _TBD_ | | -| Capability-query prompts | _TBD_ | | -| Attachments-present turns | _TBD_ | | +| Tool-adjacent prompts | Not observed in Window A (`forced_native_guard` count 0). | Collect dedicated tool-adjacent prompts in Window B to validate `no_tools_mode` behavior. | +| Capability-query prompts | Not observed in Window A (`guard_reason=capability_query` count 0). | Add explicit capability-query probes in Window B. | +| Attachments-present turns | Not observed in Window A (`guard_reason=attachments_present` count 0). | Add attachment turns in Window B. | ## Decision Record -- Decision date: _TBD_ -- Decision: _expand | hold | rollback_ -- Rationale: _TBD_ -- Next cohort/config delta: _TBD_ +- Decision date: February 24, 2026 +- Decision: `hold` (no cohort expansion yet) +- Rationale: Window A fails 3 of 4 numeric gates (p50 delta, p95 delta, fallback rate) on a sample of only 10 routed turns (8 target, 2 baseline), and it surfaced two concrete fallback failure modes: + - module session factory mismatch + - no assistant text returned from Pi runtime +- Next cohort/config delta: none until Window B confirms gate pass and fallback causes are remediated. 
## Diagram/Protocol Impact Review diff --git a/docs/plans/state.json b/docs/plans/state.json index e6e2eff..1866258 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -7,7 +7,7 @@ "status": "in_progress", "date": "2026-02-24", "updated": "2026-02-24", - "summary": "Started formal Pi embedded canary evaluation with reproducible audit-log summarization (route/success/fallback/session latency), explicit rollout gate thresholds, and a decision record template covering reliability, latency, fallback taxonomy, and tool-compat findings.", + "summary": "Formal Pi embedded canary evaluation is active with audit-log summarization and fixed rollout gates. Window A (2026-02-24) was captured for `telegram:8367012007` and resulted in `hold` due to fallback rate and latency deltas exceeding thresholds; expansion remains blocked pending Window B and fallback remediation.", "files_modified": [ "src/audit/backendCanarySummary.ts", "src/audit/backendCanarySummary.test.ts", @@ -16,12 +16,14 @@ "README.md", "docs/plans/pi_embedded_evaluation.md", "docs/plans/artifacts/.gitkeep", + "docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md", + "docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json", "docs/architecture/AGENT_DIAGRAM.md", "docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md", "docs/api/PROTOCOL.md", "docs/plans/state.json" ], - "test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary (smoke run) passing; pnpm lint unchanged warnings-only baseline" + "test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log (Window A artifacts generated) passing; pnpm lint unchanged warnings-only baseline" }, "pi-embedded-backend-canary-spike": { "status": "completed", @@ -6482,7 +6484,7 @@ "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. 
Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening", "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas", "pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default", - "pi_embedded_evaluation_phase": "in progress — added audit-backed canary summarizer + gate thresholds + decision template for reliability/latency/tool-compat rollout decisions" + "pi_embedded_evaluation_phase": "in progress — Window A recorded (10 routes, gate HOLD: p50 +259ms, p95 +5695ms, fallback 25%); no cohort expansion until Window B + fallback remediation" }, "soul_md_and_cron_create": { "date": "2026-02-11",