docs(eval): add window B telemetry slice and maintain hold decision
This commit is contained in:
@@ -0,0 +1,100 @@
|
||||
{
|
||||
"generated_at": "2026-02-24T06:30:22.439Z",
|
||||
"event_count": 24,
|
||||
"filters": {
|
||||
"since_ms": 1771913640000
|
||||
},
|
||||
"options": {
|
||||
"targetBackend": "pi_embedded",
|
||||
"baselineBackend": "native",
|
||||
"sessionIds": [
|
||||
"telegram:8367012007"
|
||||
]
|
||||
},
|
||||
"summary": {
|
||||
"route_stats": {
|
||||
"total": 6,
|
||||
"by_backend": {
|
||||
"pi_embedded": 6
|
||||
},
|
||||
"by_source": {
|
||||
"agent_override": 6
|
||||
},
|
||||
"forced_native_guards": {}
|
||||
},
|
||||
"target": {
|
||||
"backend": "pi_embedded",
|
||||
"routes": 6,
|
||||
"completed_turns": 6,
|
||||
"incomplete_turns": 0,
|
||||
"completion_rate_pct": 100,
|
||||
"e2e_latency_ms": {
|
||||
"count": 6,
|
||||
"avg_ms": 5212,
|
||||
"p50_ms": 5091,
|
||||
"p95_ms": 8949,
|
||||
"min_ms": 1859,
|
||||
"max_ms": 9381
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"backend": "native",
|
||||
"routes": 0,
|
||||
"completed_turns": 0,
|
||||
"incomplete_turns": 0,
|
||||
"completion_rate_pct": null,
|
||||
"e2e_latency_ms": null
|
||||
},
|
||||
"target_external_attempts": {
|
||||
"attempts": 6,
|
||||
"successes": 6,
|
||||
"fallbacks": 0,
|
||||
"unresolved_attempts": 0,
|
||||
"success_rate_pct": 100,
|
||||
"attempt_latency_ms": {
|
||||
"count": 6,
|
||||
"avg_ms": 5201,
|
||||
"p50_ms": 5082,
|
||||
"p95_ms": 8939,
|
||||
"min_ms": 1848,
|
||||
"max_ms": 9371
|
||||
}
|
||||
},
|
||||
"comparison": {
|
||||
"completion_rate_delta_pp": null,
|
||||
"p50_latency_delta_ms": null,
|
||||
"p95_latency_delta_ms": null
|
||||
},
|
||||
"fallback_categories": [],
|
||||
"fallback_top_reasons": []
|
||||
},
|
||||
"gate": {
|
||||
"pass": false,
|
||||
"criteria": [
|
||||
{
|
||||
"criterion": "Completion rate delta (target - baseline)",
|
||||
"pass": false,
|
||||
"actual": "n/a",
|
||||
"threshold": ">= -2.00pp"
|
||||
},
|
||||
{
|
||||
"criterion": "P50 latency delta (target - baseline)",
|
||||
"pass": false,
|
||||
"actual": "n/a",
|
||||
"threshold": "<= 250ms"
|
||||
},
|
||||
{
|
||||
"criterion": "P95 latency delta (target - baseline)",
|
||||
"pass": false,
|
||||
"actual": "n/a",
|
||||
"threshold": "<= 700ms"
|
||||
},
|
||||
{
|
||||
"criterion": "Fallback rate (target external attempts)",
|
||||
"pass": true,
|
||||
"actual": "0.00%",
|
||||
"threshold": "<= 5.00%"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
# Pi Embedded Canary Summary
|
||||
|
||||
- Target backend: `pi_embedded`
|
||||
- Baseline backend: `native`
|
||||
- Routes analyzed: 6
|
||||
|
||||
## Route Distribution
|
||||
|
||||
| Backend | Routes |
|
||||
| --- | ---: |
|
||||
| pi_embedded | 6 |
|
||||
|
||||
## Reliability
|
||||
|
||||
| Metric | Target | Baseline | Delta |
|
||||
| --- | ---: | ---: | ---: |
|
||||
| Turn completion rate | 100.00% | n/a | n/a |
|
||||
| External success rate | 100.00% | n/a | n/a |
|
||||
| External attempts | 6 | n/a | n/a |
|
||||
| External fallbacks | 0 | n/a | n/a |
|
||||
|
||||
## Latency
|
||||
|
||||
- Target end-to-end: count=6, avg=5212ms, p50=5091ms, p95=8949ms, min=1859ms, max=9381ms
|
||||
- Baseline end-to-end: n/a
|
||||
- P50 delta (target - baseline): n/a
|
||||
- P95 delta (target - baseline): n/a
|
||||
- Target external attempt: count=6, avg=5201ms, p50=5082ms, p95=8939ms, min=1848ms, max=9371ms
|
||||
|
||||
## Fallback Taxonomy
|
||||
|
||||
| Category | Count | Percent |
|
||||
| --- | ---: | ---: |
|
||||
| _none_ | 0 | 0.00% |
|
||||
|
||||
## Top Fallback Reasons
|
||||
|
||||
- none
|
||||
|
||||
## Gate Evaluation
|
||||
|
||||
- Gate result: HOLD
|
||||
- [ ] Completion rate delta (target - baseline): actual=n/a, threshold=>= -2.00pp
|
||||
- [ ] P50 latency delta (target - baseline): actual=n/a, threshold=<= 250ms
|
||||
- [ ] P95 latency delta (target - baseline): actual=n/a, threshold=<= 700ms
|
||||
- [x] Fallback rate (target external attempts): actual=0.00%, threshold=<= 5.00%
|
||||
|
||||
@@ -81,17 +81,19 @@ pnpm audit:backend-canary \
|
||||
|
||||
### Window B
|
||||
|
||||
- Dates: _TBD_
|
||||
- Route volume: _TBD_
|
||||
- Summary artifact: _TBD_
|
||||
- Dates: February 24, 2026 (since 06:14:00Z; post-initial-fallback slice)
|
||||
- Route volume: 6 total routes (`pi_embedded`: 6, `native`: 0)
|
||||
- Summary artifacts:
|
||||
- `docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md`
|
||||
- `docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json`
|
||||
|
||||
| Check | Result | Notes |
|
||||
| --- | --- | --- |
|
||||
| Completion rate delta | _TBD_ | |
|
||||
| P50 latency delta | _TBD_ | |
|
||||
| P95 latency delta | _TBD_ | |
|
||||
| Fallback rate | _TBD_ | |
|
||||
| Guardrail escapes | _TBD_ | |
|
||||
| Completion rate delta | n/a (insufficient baseline) | no native-routed turns in this slice |
|
||||
| P50 latency delta | n/a (insufficient baseline) | no native-routed turns in this slice |
|
||||
| P95 latency delta | n/a (insufficient baseline) | no native-routed turns in this slice |
|
||||
| Fallback rate | 0.00% (pass) | 0 fallbacks / 6 attempts |
|
||||
| Guardrail escapes | none observed (provisional pass) | no `forced_native_guard` events in this window |
|
||||
|
||||
## Tool Compatibility Findings
|
||||
|
||||
@@ -110,7 +112,8 @@ Track all tool-adjacent/risky prompts that were force-routed to native (`no_tool
|
||||
- Rationale: Window A fails 3/4 numeric gates (p50 delta, p95 delta, fallback rate) with only 10 total routed turns, including two concrete fallback failure modes:
|
||||
- module session factory mismatch
|
||||
- no assistant text returned from Pi runtime
|
||||
- Next cohort/config delta: none until Window B confirms gate pass and fallback causes are remediated.
|
||||
Window B shows fallback recovery (0% fallback rate) in a post-fallback slice, but its delta gates cannot be evaluated because the slice contains no baseline native routes.
|
||||
- Next cohort/config delta: none until an additional baseline-balanced window confirms that the delta gates pass and the guardrail coverage probes are completed.
|
||||
|
||||
## Diagram/Protocol Impact Review
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"status": "in_progress",
|
||||
"date": "2026-02-24",
|
||||
"updated": "2026-02-24",
|
||||
"summary": "Formal Pi embedded canary evaluation is active with audit-log summarization and fixed rollout gates. Window A (2026-02-24) was captured for `telegram:8367012007` and resulted in `hold` due to fallback rate and latency deltas exceeding thresholds; expansion remains blocked pending Window B and fallback remediation.",
|
||||
"summary": "Formal Pi embedded canary evaluation is active with audit-log summarization and fixed rollout gates. Window A (2026-02-24) produced `hold` due to fallback rate and latency deltas above thresholds. Window B (post-fallback slice) shows fallback recovery (0%) but remains baseline-insufficient for delta-gate pass/fail, so expansion is still blocked.",
|
||||
"files_modified": [
|
||||
"src/audit/backendCanarySummary.ts",
|
||||
"src/audit/backendCanarySummary.test.ts",
|
||||
@@ -18,12 +18,14 @@
|
||||
"docs/plans/artifacts/.gitkeep",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json",
|
||||
"docs/architecture/AGENT_DIAGRAM.md",
|
||||
"docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md",
|
||||
"docs/api/PROTOCOL.md",
|
||||
"docs/plans/state.json"
|
||||
],
|
||||
"test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log (Window A artifacts generated) passing; pnpm lint unchanged warnings-only baseline"
|
||||
"test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log (Window A + Window B artifacts generated) passing; pnpm lint unchanged warnings-only baseline"
|
||||
},
|
||||
"pi-embedded-backend-canary-spike": {
|
||||
"status": "completed",
|
||||
@@ -6484,7 +6486,7 @@
|
||||
"remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
|
||||
"next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas",
|
||||
"pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
|
||||
"pi_embedded_evaluation_phase": "in progress — Window A recorded (10 routes, gate HOLD: p50 +259ms, p95 +5695ms, fallback 25%); no cohort expansion until Window B + fallback remediation"
|
||||
"pi_embedded_evaluation_phase": "in progress — Window A gate HOLD (p50 +259ms, p95 +5695ms, fallback 25%); Window B post-fallback slice has 0% fallback but insufficient native baseline for delta gates; no cohort expansion"
|
||||
},
|
||||
"soul_md_and_cron_create": {
|
||||
"date": "2026-02-11",
|
||||
|
||||
Reference in New Issue
Block a user