docs(eval): add window B telemetry slice and maintain hold decision

This commit is contained in:
William Valentin
2026-02-23 22:31:06 -08:00
parent 9156adb2a8
commit 2d31f85c75
4 changed files with 164 additions and 12 deletions
@@ -0,0 +1,100 @@
{
"generated_at": "2026-02-24T06:30:22.439Z",
"event_count": 24,
"filters": {
"since_ms": 1771913640000
},
"options": {
"targetBackend": "pi_embedded",
"baselineBackend": "native",
"sessionIds": [
"telegram:8367012007"
]
},
"summary": {
"route_stats": {
"total": 6,
"by_backend": {
"pi_embedded": 6
},
"by_source": {
"agent_override": 6
},
"forced_native_guards": {}
},
"target": {
"backend": "pi_embedded",
"routes": 6,
"completed_turns": 6,
"incomplete_turns": 0,
"completion_rate_pct": 100,
"e2e_latency_ms": {
"count": 6,
"avg_ms": 5212,
"p50_ms": 5091,
"p95_ms": 8949,
"min_ms": 1859,
"max_ms": 9381
}
},
"baseline": {
"backend": "native",
"routes": 0,
"completed_turns": 0,
"incomplete_turns": 0,
"completion_rate_pct": null,
"e2e_latency_ms": null
},
"target_external_attempts": {
"attempts": 6,
"successes": 6,
"fallbacks": 0,
"unresolved_attempts": 0,
"success_rate_pct": 100,
"attempt_latency_ms": {
"count": 6,
"avg_ms": 5201,
"p50_ms": 5082,
"p95_ms": 8939,
"min_ms": 1848,
"max_ms": 9371
}
},
"comparison": {
"completion_rate_delta_pp": null,
"p50_latency_delta_ms": null,
"p95_latency_delta_ms": null
},
"fallback_categories": [],
"fallback_top_reasons": []
},
"gate": {
"pass": false,
"criteria": [
{
"criterion": "Completion rate delta (target - baseline)",
"pass": false,
"actual": "n/a",
"threshold": ">= -2.00pp"
},
{
"criterion": "P50 latency delta (target - baseline)",
"pass": false,
"actual": "n/a",
"threshold": "<= 250ms"
},
{
"criterion": "P95 latency delta (target - baseline)",
"pass": false,
"actual": "n/a",
"threshold": "<= 700ms"
},
{
"criterion": "Fallback rate (target external attempts)",
"pass": true,
"actual": "0.00%",
"threshold": "<= 5.00%"
}
]
}
}
@@ -0,0 +1,47 @@
# Pi Embedded Canary Summary
- Target backend: `pi_embedded`
- Baseline backend: `native`
- Routes analyzed: 6
## Route Distribution
| Backend | Routes |
| --- | ---: |
| pi_embedded | 6 |
## Reliability
| Metric | Target | Baseline | Delta |
| --- | ---: | ---: | ---: |
| Turn completion rate | 100.00% | n/a | n/a |
| External success rate | 100.00% | n/a | n/a |
| External attempts | 6 | n/a | n/a |
| External fallbacks | 0 | n/a | n/a |
## Latency
- Target end-to-end: count=6, avg=5212ms, p50=5091ms, p95=8949ms, min=1859ms, max=9381ms
- Baseline end-to-end: n/a (no `native` routes were recorded in this window)
- P50 delta (target - baseline): n/a
- P95 delta (target - baseline): n/a
- Target external attempt: count=6, avg=5201ms, p50=5082ms, p95=8939ms, min=1848ms, max=9371ms
## Fallback Taxonomy
| Category | Count | Percent |
| --- | ---: | ---: |
| _none_ | 0 | 0.00% |
## Top Fallback Reasons
- none
## Gate Evaluation
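- Gate result: HOLD (no `native` baseline routes in this window, so the three target-vs-baseline delta criteria could not be evaluated; this is not a measured regression)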
- [ ] Completion rate delta (target - baseline): actual=n/a, threshold=>= -2.00pp
- [ ] P50 latency delta (target - baseline): actual=n/a, threshold=<= 250ms
- [ ] P95 latency delta (target - baseline): actual=n/a, threshold=<= 700ms
- [x] Fallback rate (target external attempts): actual=0.00%, threshold=<= 5.00%