docs(eval): record pi canary window A results and hold decision
This commit is contained in:
@@ -0,0 +1,126 @@
|
|||||||
|
{
|
||||||
|
"generated_at": "2026-02-24T06:28:12.014Z",
|
||||||
|
"event_count": 45,
|
||||||
|
"filters": {},
|
||||||
|
"options": {
|
||||||
|
"targetBackend": "pi_embedded",
|
||||||
|
"baselineBackend": "native",
|
||||||
|
"sessionIds": [
|
||||||
|
"telegram:8367012007"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"summary": {
|
||||||
|
"route_stats": {
|
||||||
|
"total": 10,
|
||||||
|
"by_backend": {
|
||||||
|
"native": 2,
|
||||||
|
"pi_embedded": 8
|
||||||
|
},
|
||||||
|
"by_source": {
|
||||||
|
"agent_override": 10
|
||||||
|
},
|
||||||
|
"forced_native_guards": {}
|
||||||
|
},
|
||||||
|
"target": {
|
||||||
|
"backend": "pi_embedded",
|
||||||
|
"routes": 8,
|
||||||
|
"completed_turns": 8,
|
||||||
|
"incomplete_turns": 0,
|
||||||
|
"completion_rate_pct": 100,
|
||||||
|
"e2e_latency_ms": {
|
||||||
|
"count": 8,
|
||||||
|
"avg_ms": 4615,
|
||||||
|
"p50_ms": 3240,
|
||||||
|
"p95_ms": 8776,
|
||||||
|
"min_ms": 1859,
|
||||||
|
"max_ms": 9381
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"baseline": {
|
||||||
|
"backend": "native",
|
||||||
|
"routes": 2,
|
||||||
|
"completed_turns": 2,
|
||||||
|
"incomplete_turns": 0,
|
||||||
|
"completion_rate_pct": 100,
|
||||||
|
"e2e_latency_ms": {
|
||||||
|
"count": 2,
|
||||||
|
"avg_ms": 2981,
|
||||||
|
"p50_ms": 2981,
|
||||||
|
"p95_ms": 3081,
|
||||||
|
"min_ms": 2870,
|
||||||
|
"max_ms": 3092
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"target_external_attempts": {
|
||||||
|
"attempts": 8,
|
||||||
|
"successes": 6,
|
||||||
|
"fallbacks": 2,
|
||||||
|
"unresolved_attempts": 0,
|
||||||
|
"success_rate_pct": 75,
|
||||||
|
"attempt_latency_ms": {
|
||||||
|
"count": 8,
|
||||||
|
"avg_ms": 3961,
|
||||||
|
"p50_ms": 2636,
|
||||||
|
"p95_ms": 8766,
|
||||||
|
"min_ms": 135,
|
||||||
|
"max_ms": 9371
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"comparison": {
|
||||||
|
"completion_rate_delta_pp": 0,
|
||||||
|
"p50_latency_delta_ms": 259,
|
||||||
|
"p95_latency_delta_ms": 5695
|
||||||
|
},
|
||||||
|
"fallback_categories": [
|
||||||
|
{
|
||||||
|
"category": "loaded pi module does not expose a supported session factory (expected one of: c",
|
||||||
|
"count": 1,
|
||||||
|
"pct": 50
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"category": "pi agent runtime produced no assistant text",
|
||||||
|
"count": 1,
|
||||||
|
"pct": 50
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fallback_top_reasons": [
|
||||||
|
{
|
||||||
|
"reason": "Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge",
|
||||||
|
"count": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"reason": "Pi Agent runtime produced no assistant text",
|
||||||
|
"count": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"gate": {
|
||||||
|
"pass": false,
|
||||||
|
"criteria": [
|
||||||
|
{
|
||||||
|
"criterion": "Completion rate delta (target - baseline)",
|
||||||
|
"pass": true,
|
||||||
|
"actual": "0.00pp",
|
||||||
|
"threshold": ">= -2.00pp"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"criterion": "P50 latency delta (target - baseline)",
|
||||||
|
"pass": false,
|
||||||
|
"actual": "259ms",
|
||||||
|
"threshold": "<= 250ms"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"criterion": "P95 latency delta (target - baseline)",
|
||||||
|
"pass": false,
|
||||||
|
"actual": "5695ms",
|
||||||
|
"threshold": "<= 700ms"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"criterion": "Fallback rate (target external attempts)",
|
||||||
|
"pass": false,
|
||||||
|
"actual": "25.00%",
|
||||||
|
"threshold": "<= 5.00%"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
# Pi Embedded Canary Summary
|
||||||
|
|
||||||
|
- Target backend: `pi_embedded`
|
||||||
|
- Baseline backend: `native`
|
||||||
|
- Routes analyzed: 10
|
||||||
|
|
||||||
|
## Route Distribution
|
||||||
|
|
||||||
|
| Backend | Routes |
|
||||||
|
| --- | ---: |
|
||||||
|
| pi_embedded | 8 |
|
||||||
|
| native | 2 |
|
||||||
|
|
||||||
|
## Reliability
|
||||||
|
|
||||||
|
| Metric | Target | Baseline | Delta |
|
||||||
|
| --- | ---: | ---: | ---: |
|
||||||
|
| Turn completion rate | 100.00% | 100.00% | 0.00pp |
|
||||||
|
| External success rate | 75.00% | n/a | n/a |
|
||||||
|
| External attempts | 8 | n/a | n/a |
|
||||||
|
| External fallbacks | 2 | n/a | n/a |
|
||||||
|
|
||||||
|
## Latency
|
||||||
|
|
||||||
|
- Target end-to-end: count=8, avg=4615ms, p50=3240ms, p95=8776ms, min=1859ms, max=9381ms
|
||||||
|
- Baseline end-to-end: count=2, avg=2981ms, p50=2981ms, p95=3081ms, min=2870ms, max=3092ms
|
||||||
|
- P50 delta (target - baseline): 259ms
|
||||||
|
- P95 delta (target - baseline): 5695ms
|
||||||
|
- Target external attempt: count=8, avg=3961ms, p50=2636ms, p95=8766ms, min=135ms, max=9371ms
|
||||||
|
|
||||||
|
## Fallback Taxonomy
|
||||||
|
|
||||||
|
| Category | Count | Percent |
|
||||||
|
| --- | ---: | ---: |
|
||||||
|
| loaded pi module does not expose a supported session factory (expected one of: c | 1 | 50.00% |
|
||||||
|
| pi agent runtime produced no assistant text | 1 | 50.00% |
|
||||||
|
|
||||||
|
## Top Fallback Reasons
|
||||||
|
|
||||||
|
- Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge (1)
|
||||||
|
- Pi Agent runtime produced no assistant text (1)
|
||||||
|
|
||||||
|
## Gate Evaluation
|
||||||
|
|
||||||
|
- Gate result: HOLD
|
||||||
|
- [x] Completion rate delta (target - baseline): actual=0.00pp, threshold=>= -2.00pp
|
||||||
|
- [ ] P50 latency delta (target - baseline): actual=259ms, threshold=<= 250ms
|
||||||
|
- [ ] P95 latency delta (target - baseline): actual=5695ms, threshold=<= 700ms
|
||||||
|
- [ ] Fallback rate (target external attempts): actual=25.00%, threshold=<= 5.00%
|
||||||
|
|
||||||
@@ -65,17 +65,19 @@ pnpm audit:backend-canary \
|
|||||||
|
|
||||||
### Window A
|
### Window A
|
||||||
|
|
||||||
- Dates: _TBD_
|
- Dates: February 24, 2026 (05:29:49Z to 06:26:20Z)
|
||||||
- Route volume: _TBD_
|
- Route volume: 10 total routes (`pi_embedded`: 8, `native`: 2)
|
||||||
- Summary artifact: _TBD_
|
- Summary artifacts:
|
||||||
|
- `docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md`
|
||||||
|
- `docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json`
|
||||||
|
|
||||||
| Check | Result | Notes |
|
| Check | Result | Notes |
|
||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| Completion rate delta | _TBD_ | |
|
| Completion rate delta | 0.00pp (pass) | target 100.00% vs baseline 100.00% |
|
||||||
| P50 latency delta | _TBD_ | |
|
| P50 latency delta | +259ms (fail) | gate <= +250ms |
|
||||||
| P95 latency delta | _TBD_ | |
|
| P95 latency delta | +5695ms (fail) | gate <= +700ms |
|
||||||
| Fallback rate | _TBD_ | |
|
| Fallback rate | 25.00% (fail) | 2 fallbacks / 8 attempts; gate <= 5.00% |
|
||||||
| Guardrail escapes | _TBD_ | |
|
| Guardrail escapes | none observed (provisional pass) | no `forced_native_guard` events in this window; no guard-triggering prompts were observed, so the guard path was not exercised |
|
||||||
|
|
||||||
### Window B
|
### Window B
|
||||||
|
|
||||||
@@ -97,16 +99,18 @@ Track all tool-adjacent/risky prompts that were force-routed to native (`no_tool
|
|||||||
|
|
||||||
| Class | Observed behavior | Action |
|
| Class | Observed behavior | Action |
|
||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| Tool-adjacent prompts | _TBD_ | |
|
| Tool-adjacent prompts | Not observed in Window A (`forced_native_guard` count 0). | Collect dedicated tool-adjacent prompts in Window B to validate `no_tools_mode` behavior. |
|
||||||
| Capability-query prompts | _TBD_ | |
|
| Capability-query prompts | Not observed in Window A (`guard_reason=capability_query` count 0). | Add explicit capability-query probes in Window B. |
|
||||||
| Attachments-present turns | _TBD_ | |
|
| Attachments-present turns | Not observed in Window A (`guard_reason=attachments_present` count 0). | Add attachment turns in Window B. |
|
||||||
|
|
||||||
## Decision Record
|
## Decision Record
|
||||||
|
|
||||||
- Decision date: _TBD_
|
- Decision date: February 24, 2026
|
||||||
- Decision: _expand | hold | rollback_
|
- Decision: `hold` (no cohort expansion yet)
|
||||||
- Rationale: _TBD_
|
- Rationale: Window A fails 3 of 4 numeric gates (p50 delta, p95 delta, fallback rate) on a small sample of only 10 routed turns (8 `pi_embedded`, 2 `native`), and it surfaced two concrete fallback failure modes:
|
||||||
- Next cohort/config delta: _TBD_
|
- module session factory mismatch
|
||||||
|
- no assistant text returned from Pi runtime
|
||||||
|
- Next cohort/config delta: none until Window B confirms gate pass and fallback causes are remediated.
|
||||||
|
|
||||||
## Diagram/Protocol Impact Review
|
## Diagram/Protocol Impact Review
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
"status": "in_progress",
|
"status": "in_progress",
|
||||||
"date": "2026-02-24",
|
"date": "2026-02-24",
|
||||||
"updated": "2026-02-24",
|
"updated": "2026-02-24",
|
||||||
"summary": "Started formal Pi embedded canary evaluation with reproducible audit-log summarization (route/success/fallback/session latency), explicit rollout gate thresholds, and a decision record template covering reliability, latency, fallback taxonomy, and tool-compat findings.",
|
"summary": "Formal Pi embedded canary evaluation is active with audit-log summarization and fixed rollout gates. Window A (2026-02-24) was captured for `telegram:8367012007` and resulted in `hold` due to fallback rate and latency deltas exceeding thresholds; expansion remains blocked pending Window B and fallback remediation.",
|
||||||
"files_modified": [
|
"files_modified": [
|
||||||
"src/audit/backendCanarySummary.ts",
|
"src/audit/backendCanarySummary.ts",
|
||||||
"src/audit/backendCanarySummary.test.ts",
|
"src/audit/backendCanarySummary.test.ts",
|
||||||
@@ -16,12 +16,14 @@
|
|||||||
"README.md",
|
"README.md",
|
||||||
"docs/plans/pi_embedded_evaluation.md",
|
"docs/plans/pi_embedded_evaluation.md",
|
||||||
"docs/plans/artifacts/.gitkeep",
|
"docs/plans/artifacts/.gitkeep",
|
||||||
|
"docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md",
|
||||||
|
"docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json",
|
||||||
"docs/architecture/AGENT_DIAGRAM.md",
|
"docs/architecture/AGENT_DIAGRAM.md",
|
||||||
"docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md",
|
"docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md",
|
||||||
"docs/api/PROTOCOL.md",
|
"docs/api/PROTOCOL.md",
|
||||||
"docs/plans/state.json"
|
"docs/plans/state.json"
|
||||||
],
|
],
|
||||||
"test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary (smoke run) passing; pnpm lint unchanged warnings-only baseline"
|
"test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log (Window A artifacts generated) passing; pnpm lint unchanged warnings-only baseline"
|
||||||
},
|
},
|
||||||
"pi-embedded-backend-canary-spike": {
|
"pi-embedded-backend-canary-spike": {
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
@@ -6482,7 +6484,7 @@
|
|||||||
"remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
|
"remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
|
||||||
"next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas",
|
"next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas",
|
||||||
"pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
|
"pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
|
||||||
"pi_embedded_evaluation_phase": "in progress — added audit-backed canary summarizer + gate thresholds + decision template for reliability/latency/tool-compat rollout decisions"
|
"pi_embedded_evaluation_phase": "in progress — Window A recorded (10 routes, gate HOLD: p50 +259ms, p95 +5695ms, fallback 25%); no cohort expansion until Window B + fallback remediation"
|
||||||
},
|
},
|
||||||
"soul_md_and_cron_create": {
|
"soul_md_and_cron_create": {
|
||||||
"date": "2026-02-11",
|
"date": "2026-02-11",
|
||||||
|
|||||||
Reference in New Issue
Block a user