docs(eval): record guard-coverage preprobe window and updated gates

This commit is contained in:
William Valentin
2026-02-23 22:38:18 -08:00
parent c5b310c852
commit 7b80c1e7a4
9 changed files with 291 additions and 5 deletions
@@ -1,5 +1,5 @@
{
"generated_at": "2026-02-24T06:33:31.111Z",
"generated_at": "2026-02-24T06:37:06.279Z",
"event_count": 45,
"filters": {},
"options": {
@@ -11,6 +11,12 @@
| pi_embedded | 8 |
| native | 2 |
### Forced Native Guards
| Guard reason | Count |
| --- | ---: |
| _none_ | 0 |
## Reliability
| Metric | Target | Baseline | Delta |
@@ -1,5 +1,5 @@
{
"generated_at": "2026-02-24T06:33:37.931Z",
"generated_at": "2026-02-24T06:37:06.252Z",
"event_count": 24,
"filters": {
"since_ms": 1771913640000
@@ -10,6 +10,12 @@
| --- | ---: |
| pi_embedded | 6 |
### Forced Native Guards
| Guard reason | Count |
| --- | ---: |
| _none_ | 0 |
## Reliability
| Metric | Target | Baseline | Delta |
@@ -0,0 +1,162 @@
{
"generated_at": "2026-02-24T06:37:21.443Z",
"event_count": 45,
"filters": {},
"options": {
"targetBackend": "pi_embedded",
"baselineBackend": "native",
"sessionIds": [
"telegram:8367012007"
]
},
"summary": {
"route_stats": {
"total": 10,
"by_backend": {
"native": 2,
"pi_embedded": 8
},
"by_source": {
"agent_override": 10
},
"forced_native_guards": {}
},
"target": {
"backend": "pi_embedded",
"routes": 8,
"completed_turns": 8,
"incomplete_turns": 0,
"completion_rate_pct": 100,
"e2e_latency_ms": {
"count": 8,
"avg_ms": 4615,
"p50_ms": 3240,
"p95_ms": 8776,
"min_ms": 1859,
"max_ms": 9381
}
},
"baseline": {
"backend": "native",
"routes": 2,
"completed_turns": 2,
"incomplete_turns": 0,
"completion_rate_pct": 100,
"e2e_latency_ms": {
"count": 2,
"avg_ms": 2981,
"p50_ms": 2981,
"p95_ms": 3081,
"min_ms": 2870,
"max_ms": 3092
}
},
"target_external_attempts": {
"attempts": 8,
"successes": 6,
"fallbacks": 2,
"unresolved_attempts": 0,
"success_rate_pct": 75,
"attempt_latency_ms": {
"count": 8,
"avg_ms": 3961,
"p50_ms": 2636,
"p95_ms": 8766,
"min_ms": 135,
"max_ms": 9371
}
},
"comparison": {
"completion_rate_delta_pp": 0,
"p50_latency_delta_ms": 259,
"p95_latency_delta_ms": 5695
},
"fallback_categories": [
{
"category": "empty_assistant_text",
"count": 1,
"pct": 50
},
{
"category": "pi_module_interface",
"count": 1,
"pct": 50
}
],
"fallback_top_reasons": [
{
"reason": "Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge",
"count": 1
},
{
"reason": "Pi Agent runtime produced no assistant text",
"count": 1
}
]
},
"gate": {
"pass": false,
"criteria": [
{
"criterion": "Minimum target routes",
"pass": true,
"actual": "8",
"threshold": ">= 8"
},
{
"criterion": "Minimum baseline routes",
"pass": true,
"actual": "2",
"threshold": ">= 2"
},
{
"criterion": "Minimum target external attempts",
"pass": true,
"actual": "8",
"threshold": ">= 8"
},
{
"criterion": "Minimum pi_no_tools_mode guard hits",
"pass": false,
"actual": "0",
"threshold": ">= 1"
},
{
"criterion": "Minimum capability_query guard hits",
"pass": false,
"actual": "0",
"threshold": ">= 1"
},
{
"criterion": "Minimum attachments_present guard hits",
"pass": false,
"actual": "0",
"threshold": ">= 1"
},
{
"criterion": "Completion rate delta (target - baseline)",
"pass": true,
"actual": "0.00pp",
"threshold": ">= -2.00pp"
},
{
"criterion": "P50 latency delta (target - baseline)",
"pass": false,
"actual": "259ms",
"threshold": "<= 250ms"
},
{
"criterion": "P95 latency delta (target - baseline)",
"pass": false,
"actual": "5695ms",
"threshold": "<= 700ms"
},
{
"criterion": "Fallback rate (target external attempts)",
"pass": false,
"actual": "25.00%",
"threshold": "<= 5.00%"
}
]
}
}
@@ -0,0 +1,62 @@
# Pi Embedded Canary Summary
- Target backend: `pi_embedded`
- Baseline backend: `native`
- Routes analyzed: 10
## Route Distribution
| Backend | Routes |
| --- | ---: |
| pi_embedded | 8 |
| native | 2 |
### Forced Native Guards
| Guard reason | Count |
| --- | ---: |
| _none_ | 0 |
## Reliability
| Metric | Target | Baseline | Delta |
| --- | ---: | ---: | ---: |
| Turn completion rate | 100.00% | 100.00% | 0.00pp |
| External success rate | 75.00% | n/a | n/a |
| External attempts | 8 | n/a | n/a |
| External fallbacks | 2 | n/a | n/a |
## Latency
- Target end-to-end: count=8, avg=4615ms, p50=3240ms, p95=8776ms, min=1859ms, max=9381ms
- Baseline end-to-end: count=2, avg=2981ms, p50=2981ms, p95=3081ms, min=2870ms, max=3092ms
- P50 delta (target - baseline): 259ms
- P95 delta (target - baseline): 5695ms
- Target external attempt: count=8, avg=3961ms, p50=2636ms, p95=8766ms, min=135ms, max=9371ms
## Fallback Taxonomy
| Category | Count | Percent |
| --- | ---: | ---: |
| empty_assistant_text | 1 | 50.00% |
| pi_module_interface | 1 | 50.00% |
## Top Fallback Reasons
- Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge…) (1)
- Pi Agent runtime produced no assistant text (1)
## Gate Evaluation
- Gate result: HOLD
- [x] Minimum target routes: actual=8, threshold=>= 8
- [x] Minimum baseline routes: actual=2, threshold=>= 2
- [x] Minimum target external attempts: actual=8, threshold=>= 8
- [ ] Minimum pi_no_tools_mode guard hits: actual=0, threshold=>= 1
- [ ] Minimum capability_query guard hits: actual=0, threshold=>= 1
- [ ] Minimum attachments_present guard hits: actual=0, threshold=>= 1
- [x] Completion rate delta (target - baseline): actual=0.00pp, threshold=>= -2.00pp
- [ ] P50 latency delta (target - baseline): actual=259ms, threshold=<= 250ms
- [ ] P95 latency delta (target - baseline): actual=5695ms, threshold=<= 700ms
- [ ] Fallback rate (target external attempts): actual=25.00%, threshold=<= 5.00%