docs(eval): close pi canary phase with rollback decision and probe evidence
This commit is contained in:
@@ -0,0 +1,105 @@
|
||||
{
|
||||
"generated_at": "2026-02-24T06:44:04.629Z",
|
||||
"event_count": 11,
|
||||
"filters": {},
|
||||
"options": {
|
||||
"targetBackend": "pi_embedded",
|
||||
"baselineBackend": "native",
|
||||
"sessionIds": [
|
||||
"telegram:8367012007"
|
||||
]
|
||||
},
|
||||
"summary": {
|
||||
"route_stats": {
|
||||
"total": 4,
|
||||
"by_backend": {
|
||||
"pi_embedded": 1,
|
||||
"native": 3
|
||||
},
|
||||
"by_source": {
|
||||
"default_external": 1,
|
||||
"forced_native_guard": 3
|
||||
},
|
||||
"forced_native_guards": {
|
||||
"pi_no_tools_mode": 1,
|
||||
"capability_query": 1,
|
||||
"attachments_present": 1
|
||||
}
|
||||
},
|
||||
"target": {
|
||||
"backend": "pi_embedded",
|
||||
"routes": 1,
|
||||
"completed_turns": 1,
|
||||
"incomplete_turns": 0,
|
||||
"completion_rate_pct": 100,
|
||||
"e2e_latency_ms": {
|
||||
"count": 1,
|
||||
"avg_ms": 0,
|
||||
"p50_ms": 0,
|
||||
"p95_ms": 0,
|
||||
"min_ms": 0,
|
||||
"max_ms": 0
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"backend": "native",
|
||||
"routes": 3,
|
||||
"completed_turns": 3,
|
||||
"incomplete_turns": 0,
|
||||
"completion_rate_pct": 100,
|
||||
"e2e_latency_ms": {
|
||||
"count": 3,
|
||||
"avg_ms": 0,
|
||||
"p50_ms": 0,
|
||||
"p95_ms": 0,
|
||||
"min_ms": 0,
|
||||
"max_ms": 0
|
||||
}
|
||||
},
|
||||
"target_external_attempts": {
|
||||
"attempts": 1,
|
||||
"successes": 1,
|
||||
"fallbacks": 0,
|
||||
"unresolved_attempts": 0,
|
||||
"success_rate_pct": 100,
|
||||
"attempt_latency_ms": {
|
||||
"count": 1,
|
||||
"avg_ms": 0,
|
||||
"p50_ms": 0,
|
||||
"p95_ms": 0,
|
||||
"min_ms": 0,
|
||||
"max_ms": 0
|
||||
}
|
||||
},
|
||||
"comparison": {
|
||||
"completion_rate_delta_pp": 0,
|
||||
"p50_latency_delta_ms": 0,
|
||||
"p95_latency_delta_ms": 0
|
||||
},
|
||||
"fallback_categories": [],
|
||||
"fallback_top_reasons": []
|
||||
},
|
||||
"gate": {
|
||||
"pass": true,
|
||||
"criteria": [
|
||||
{
|
||||
"criterion": "Minimum pi_no_tools_mode guard hits",
|
||||
"pass": true,
|
||||
"actual": "1",
|
||||
"threshold": ">= 1"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum capability_query guard hits",
|
||||
"pass": true,
|
||||
"actual": "1",
|
||||
"threshold": ">= 1"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum attachments_present guard hits",
|
||||
"pass": true,
|
||||
"actual": "1",
|
||||
"threshold": ">= 1"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
# Pi Embedded Canary Summary
|
||||
|
||||
- Target backend: `pi_embedded`
|
||||
- Baseline backend: `native`
|
||||
- Routes analyzed: 4
|
||||
|
||||
## Route Distribution
|
||||
|
||||
| Backend | Routes |
|
||||
| --- | ---: |
|
||||
| native | 3 |
|
||||
| pi_embedded | 1 |
|
||||
|
||||
### Forced Native Guards
|
||||
|
||||
| Guard reason | Count |
|
||||
| --- | ---: |
|
||||
| attachments_present | 1 |
|
||||
| capability_query | 1 |
|
||||
| pi_no_tools_mode | 1 |
|
||||
|
||||
## Reliability
|
||||
|
||||
| Metric | Target | Baseline | Delta |
|
||||
| --- | ---: | ---: | ---: |
|
||||
| Turn completion rate | 100.00% | 100.00% | 0.00pp |
|
||||
| External success rate | 100.00% | n/a | n/a |
|
||||
| External attempts | 1 | n/a | n/a |
|
||||
| External fallbacks | 0 | n/a | n/a |
|
||||
|
||||
## Latency
|
||||
|
||||
- Target end-to-end: count=1, avg=0ms, p50=0ms, p95=0ms, min=0ms, max=0ms
|
||||
- Baseline end-to-end: count=3, avg=0ms, p50=0ms, p95=0ms, min=0ms, max=0ms
|
||||
- P50 delta (target - baseline): 0ms
|
||||
- P95 delta (target - baseline): 0ms
|
||||
- Target external attempt: count=1, avg=0ms, p50=0ms, p95=0ms, min=0ms, max=0ms
|
||||
|
||||
## Fallback Taxonomy
|
||||
|
||||
| Category | Count | Percent |
|
||||
| --- | ---: | ---: |
|
||||
| _none_ | 0 | 0.00% |
|
||||
|
||||
## Top Fallback Reasons
|
||||
|
||||
- none
|
||||
|
||||
## Gate Evaluation
|
||||
|
||||
- Gate result: PASS
|
||||
- [x] Minimum pi_no_tools_mode guard hits: actual=1, threshold=>= 1
|
||||
- [x] Minimum capability_query guard hits: actual=1, threshold=>= 1
|
||||
- [x] Minimum attachments_present guard hits: actual=1, threshold=>= 1
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
{"timestamp":1771915435243,"level":"info","event_type":"user.action","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","source":"channel","action_type":"message","content_length":23,"attachments_count":0}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"backend.route","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","selected_backend":"pi_embedded","source":"default_external"}}
|
||||
{"timestamp":1771915435244,"level":"debug","event_type":"session.message","event":{"session_id":"telegram:8367012007","role":"user","content_length":23}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"backend.success","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","backend":"pi_embedded","duration_ms":0,"response_length":26}}
|
||||
{"timestamp":1771915435244,"level":"debug","event_type":"session.message","event":{"session_id":"telegram:8367012007","role":"assistant","content_length":26}}
|
||||
{"timestamp":1771915435244,"level":"debug","event_type":"session.message","event":{"session_id":"telegram:8367012007","role":"assistant","content_length":26}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"user.action","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","source":"channel","action_type":"message","content_length":42,"attachments_count":0}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"backend.route","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","selected_backend":"native","source":"forced_native_guard","guard_reason":"pi_no_tools_mode"}}
|
||||
{"timestamp":1771915435244,"level":"debug","event_type":"session.message","event":{"session_id":"telegram:8367012007","role":"assistant","content_length":62}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"user.action","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","source":"channel","action_type":"message","content_length":25,"attachments_count":0}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"backend.route","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","selected_backend":"native","source":"forced_native_guard","guard_reason":"capability_query"}}
|
||||
{"timestamp":1771915435244,"level":"debug","event_type":"session.message","event":{"session_id":"telegram:8367012007","role":"assistant","content_length":47}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"user.action","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","source":"channel","action_type":"message","content_length":35,"attachments_count":1}}
|
||||
{"timestamp":1771915435244,"level":"info","event_type":"backend.route","event":{"session_id":"telegram:8367012007","channel":"telegram","sender":"8367012007","selected_backend":"native","source":"forced_native_guard","guard_reason":"attachments_present"}}
|
||||
{"timestamp":1771915435244,"level":"debug","event_type":"session.message","event":{"session_id":"telegram:8367012007","role":"assistant","content_length":57}}
|
||||
Reference in New Issue
Block a user