docs(eval): record guard-coverage preprobe window and updated gates
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"generated_at": "2026-02-24T06:33:31.111Z",
|
||||
"generated_at": "2026-02-24T06:37:06.279Z",
|
||||
"event_count": 45,
|
||||
"filters": {},
|
||||
"options": {
|
||||
|
||||
@@ -11,6 +11,12 @@
|
||||
| pi_embedded | 8 |
|
||||
| native | 2 |
|
||||
|
||||
### Forced Native Guards
|
||||
|
||||
| Guard reason | Count |
|
||||
| --- | ---: |
|
||||
| _none_ | 0 |
|
||||
|
||||
## Reliability
|
||||
|
||||
| Metric | Target | Baseline | Delta |
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"generated_at": "2026-02-24T06:33:37.931Z",
|
||||
"generated_at": "2026-02-24T06:37:06.252Z",
|
||||
"event_count": 24,
|
||||
"filters": {
|
||||
"since_ms": 1771913640000
|
||||
|
||||
@@ -10,6 +10,12 @@
|
||||
| --- | ---: |
|
||||
| pi_embedded | 6 |
|
||||
|
||||
### Forced Native Guards
|
||||
|
||||
| Guard reason | Count |
|
||||
| --- | ---: |
|
||||
| _none_ | 0 |
|
||||
|
||||
## Reliability
|
||||
|
||||
| Metric | Target | Baseline | Delta |
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
{
|
||||
"generated_at": "2026-02-24T06:37:21.443Z",
|
||||
"event_count": 45,
|
||||
"filters": {},
|
||||
"options": {
|
||||
"targetBackend": "pi_embedded",
|
||||
"baselineBackend": "native",
|
||||
"sessionIds": [
|
||||
"telegram:8367012007"
|
||||
]
|
||||
},
|
||||
"summary": {
|
||||
"route_stats": {
|
||||
"total": 10,
|
||||
"by_backend": {
|
||||
"native": 2,
|
||||
"pi_embedded": 8
|
||||
},
|
||||
"by_source": {
|
||||
"agent_override": 10
|
||||
},
|
||||
"forced_native_guards": {}
|
||||
},
|
||||
"target": {
|
||||
"backend": "pi_embedded",
|
||||
"routes": 8,
|
||||
"completed_turns": 8,
|
||||
"incomplete_turns": 0,
|
||||
"completion_rate_pct": 100,
|
||||
"e2e_latency_ms": {
|
||||
"count": 8,
|
||||
"avg_ms": 4615,
|
||||
"p50_ms": 3240,
|
||||
"p95_ms": 8776,
|
||||
"min_ms": 1859,
|
||||
"max_ms": 9381
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"backend": "native",
|
||||
"routes": 2,
|
||||
"completed_turns": 2,
|
||||
"incomplete_turns": 0,
|
||||
"completion_rate_pct": 100,
|
||||
"e2e_latency_ms": {
|
||||
"count": 2,
|
||||
"avg_ms": 2981,
|
||||
"p50_ms": 2981,
|
||||
"p95_ms": 3081,
|
||||
"min_ms": 2870,
|
||||
"max_ms": 3092
|
||||
}
|
||||
},
|
||||
"target_external_attempts": {
|
||||
"attempts": 8,
|
||||
"successes": 6,
|
||||
"fallbacks": 2,
|
||||
"unresolved_attempts": 0,
|
||||
"success_rate_pct": 75,
|
||||
"attempt_latency_ms": {
|
||||
"count": 8,
|
||||
"avg_ms": 3961,
|
||||
"p50_ms": 2636,
|
||||
"p95_ms": 8766,
|
||||
"min_ms": 135,
|
||||
"max_ms": 9371
|
||||
}
|
||||
},
|
||||
"comparison": {
|
||||
"completion_rate_delta_pp": 0,
|
||||
"p50_latency_delta_ms": 259,
|
||||
"p95_latency_delta_ms": 5695
|
||||
},
|
||||
"fallback_categories": [
|
||||
{
|
||||
"category": "empty_assistant_text",
|
||||
"count": 1,
|
||||
"pct": 50
|
||||
},
|
||||
{
|
||||
"category": "pi_module_interface",
|
||||
"count": 1,
|
||||
"pct": 50
|
||||
}
|
||||
],
|
||||
"fallback_top_reasons": [
|
||||
{
|
||||
"reason": "Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"reason": "Pi Agent runtime produced no assistant text",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"gate": {
|
||||
"pass": false,
|
||||
"criteria": [
|
||||
{
|
||||
"criterion": "Minimum target routes",
|
||||
"pass": true,
|
||||
"actual": "8",
|
||||
"threshold": ">= 8"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum baseline routes",
|
||||
"pass": true,
|
||||
"actual": "2",
|
||||
"threshold": ">= 2"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum target external attempts",
|
||||
"pass": true,
|
||||
"actual": "8",
|
||||
"threshold": ">= 8"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum pi_no_tools_mode guard hits",
|
||||
"pass": false,
|
||||
"actual": "0",
|
||||
"threshold": ">= 1"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum capability_query guard hits",
|
||||
"pass": false,
|
||||
"actual": "0",
|
||||
"threshold": ">= 1"
|
||||
},
|
||||
{
|
||||
"criterion": "Minimum attachments_present guard hits",
|
||||
"pass": false,
|
||||
"actual": "0",
|
||||
"threshold": ">= 1"
|
||||
},
|
||||
{
|
||||
"criterion": "Completion rate delta (target - baseline)",
|
||||
"pass": true,
|
||||
"actual": "0.00pp",
|
||||
"threshold": ">= -2.00pp"
|
||||
},
|
||||
{
|
||||
"criterion": "P50 latency delta (target - baseline)",
|
||||
"pass": false,
|
||||
"actual": "259ms",
|
||||
"threshold": "<= 250ms"
|
||||
},
|
||||
{
|
||||
"criterion": "P95 latency delta (target - baseline)",
|
||||
"pass": false,
|
||||
"actual": "5695ms",
|
||||
"threshold": "<= 700ms"
|
||||
},
|
||||
{
|
||||
"criterion": "Fallback rate (target external attempts)",
|
||||
"pass": false,
|
||||
"actual": "25.00%",
|
||||
"threshold": "<= 5.00%"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
# Pi Embedded Canary Summary
|
||||
|
||||
- Target backend: `pi_embedded`
|
||||
- Baseline backend: `native`
|
||||
- Routes analyzed: 10
|
||||
|
||||
## Route Distribution
|
||||
|
||||
| Backend | Routes |
|
||||
| --- | ---: |
|
||||
| pi_embedded | 8 |
|
||||
| native | 2 |
|
||||
|
||||
### Forced Native Guards
|
||||
|
||||
| Guard reason | Count |
|
||||
| --- | ---: |
|
||||
| _none_ | 0 |
|
||||
|
||||
## Reliability
|
||||
|
||||
| Metric | Target | Baseline | Delta |
|
||||
| --- | ---: | ---: | ---: |
|
||||
| Turn completion rate | 100.00% | 100.00% | 0.00pp |
|
||||
| External success rate | 75.00% | n/a | n/a |
|
||||
| External attempts | 8 | n/a | n/a |
|
||||
| External fallbacks | 2 | n/a | n/a |
|
||||
|
||||
## Latency
|
||||
|
||||
- Target end-to-end: count=8, avg=4615ms, p50=3240ms, p95=8776ms, min=1859ms, max=9381ms
|
||||
- Baseline end-to-end: count=2, avg=2981ms, p50=2981ms, p95=3081ms, min=2870ms, max=3092ms
|
||||
- P50 delta (target - baseline): 259ms
|
||||
- P95 delta (target - baseline): 5695ms
|
||||
- Target external attempt: count=8, avg=3961ms, p50=2636ms, p95=8766ms, min=135ms, max=9371ms
|
||||
|
||||
## Fallback Taxonomy
|
||||
|
||||
| Category | Count | Percent |
|
||||
| --- | ---: | ---: |
|
||||
| empty_assistant_text | 1 | 50.00% |
|
||||
| pi_module_interface | 1 | 50.00% |
|
||||
|
||||
## Top Fallback Reasons
|
||||
|
||||
- Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge (1)
|
||||
- Pi Agent runtime produced no assistant text (1)
|
||||
|
||||
## Gate Evaluation
|
||||
|
||||
- Gate result: HOLD
|
||||
- [x] Minimum target routes: actual=8, threshold=>= 8
|
||||
- [x] Minimum baseline routes: actual=2, threshold=>= 2
|
||||
- [x] Minimum target external attempts: actual=8, threshold=>= 8
|
||||
- [ ] Minimum pi_no_tools_mode guard hits: actual=0, threshold=>= 1
|
||||
- [ ] Minimum capability_query guard hits: actual=0, threshold=>= 1
|
||||
- [ ] Minimum attachments_present guard hits: actual=0, threshold=>= 1
|
||||
- [x] Completion rate delta (target - baseline): actual=0.00pp, threshold=>= -2.00pp
|
||||
- [ ] P50 latency delta (target - baseline): actual=259ms, threshold=<= 250ms
|
||||
- [ ] P95 latency delta (target - baseline): actual=5695ms, threshold=<= 700ms
|
||||
- [ ] Fallback rate (target external attempts): actual=25.00%, threshold=<= 5.00%
|
||||
|
||||
@@ -24,6 +24,7 @@ Use the same thresholds for every evaluation window.
|
||||
| Minimum target routes | >= 8 |
|
||||
| Minimum baseline routes | >= 2 |
|
||||
| Minimum target external attempts | >= 8 |
|
||||
| Minimum guard coverage (probe window) | `pi_no_tools_mode >= 1`, `capability_query >= 1`, `attachments_present >= 1` |
|
||||
| Completion rate delta (target - baseline) | >= -2.00pp |
|
||||
| P50 latency delta (target - baseline) | <= +250ms |
|
||||
| P95 latency delta (target - baseline) | <= +700ms |
|
||||
@@ -34,6 +35,7 @@ Notes:
|
||||
- Completion rate and latency are computed from route-to-assistant turn timings.
|
||||
- Fallback rate is computed from `backend.success` + `backend.fallback` attempt outcomes.
|
||||
- Guardrail escapes are reviewed from `backend.route.source == forced_native_guard` + operator incident review.
|
||||
- Guard-coverage minimums are enforced for controlled probe windows, not passive traffic slices.
|
||||
|
||||
## How To Run
|
||||
|
||||
@@ -70,6 +72,27 @@ pnpm audit:backend-canary \
|
||||
--gate-max-fallback-rate-pct 5
|
||||
```
|
||||
|
||||
Run controlled probe-window evaluation (guard coverage required):
|
||||
|
||||
```bash
|
||||
pnpm audit:backend-canary \
|
||||
--audit ~/.local/share/flynn/audit.log \
|
||||
--backend pi_embedded \
|
||||
--baseline native \
|
||||
--session telegram:8367012007 \
|
||||
--gate-min-target-routes 8 \
|
||||
--gate-min-baseline-routes 2 \
|
||||
--gate-min-target-attempts 8 \
|
||||
--gate-min-guard-pi-no-tools-count 1 \
|
||||
--gate-min-guard-capability-query-count 1 \
|
||||
--gate-min-guard-attachments-present-count 1 \
|
||||
--gate-max-completion-drop-pp 2 \
|
||||
--gate-max-p50-latency-increase-ms 250 \
|
||||
--gate-max-p95-latency-increase-ms 700 \
|
||||
--gate-max-fallback-rate-pct 5 \
|
||||
--format markdown
|
||||
```
|
||||
|
||||
## Evaluation Log
|
||||
|
||||
### Window A
|
||||
@@ -110,6 +133,27 @@ pnpm audit:backend-canary \
|
||||
| Fallback rate | 0.00% (pass) | 0 fallbacks / 6 attempts |
|
||||
| Guardrail escapes | none observed (provisional pass) | no `forced_native_guard` events in this window |
|
||||
|
||||
### Window C (Guard Coverage Pre-Probe Baseline)
|
||||
|
||||
- Dates: February 24, 2026 (same full Window A slice; guard-coverage gates enabled)
|
||||
- Route volume: 10 total routes (`pi_embedded`: 8, `native`: 2)
|
||||
- Summary artifacts:
|
||||
- `docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md`
|
||||
- `docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json`
|
||||
|
||||
| Check | Result | Notes |
|
||||
| --- | --- | --- |
|
||||
| Minimum target routes | 8 (pass) | gate >= 8 |
|
||||
| Minimum baseline routes | 2 (pass) | gate >= 2 |
|
||||
| Minimum target external attempts | 8 (pass) | gate >= 8 |
|
||||
| Minimum `pi_no_tools_mode` guard hits | 0 (fail) | gate >= 1 |
|
||||
| Minimum `capability_query` guard hits | 0 (fail) | gate >= 1 |
|
||||
| Minimum `attachments_present` guard hits | 0 (fail) | gate >= 1 |
|
||||
| Completion rate delta | 0.00pp (pass) | target 100.00% vs baseline 100.00% |
|
||||
| P50 latency delta | +259ms (fail) | gate <= +250ms |
|
||||
| P95 latency delta | +5695ms (fail) | gate <= +700ms |
|
||||
| Fallback rate | 25.00% (fail) | 2 fallbacks / 8 attempts; gate <= 5.00% |
|
||||
|
||||
## Tool Compatibility Findings
|
||||
|
||||
Track all tool-adjacent/risky prompts that were force-routed to native (`no_tools_mode`) and any misses.
|
||||
@@ -125,6 +169,7 @@ Track all tool-adjacent/risky prompts that were force-routed to native (`no_tool
|
||||
- Decision date: February 24, 2026
|
||||
- Decision: `hold` (no cohort expansion yet)
|
||||
- Rationale: Window A fails 3/4 numeric gates (p50 delta, p95 delta, fallback rate) with only 10 total routed turns, and Window C pre-probe baseline confirms missing guard-coverage evidence (`pi_no_tools_mode`, `capability_query`, `attachments_present` all at 0).
|
||||
Window A includes two concrete fallback failure modes:
|
||||
- `pi_module_interface`
|
||||
- `empty_assistant_text`
|
||||
Window B shows fallback recovery (0%) in a post-fallback slice but fails minimum sample thresholds and has no native baseline routes for delta-gate evaluation.
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"status": "in_progress",
|
||||
"date": "2026-02-24",
|
||||
"updated": "2026-02-24",
|
||||
"summary": "Formal Pi embedded canary evaluation is active with audit-log summarization, minimum-sample gate thresholds, and normalized Pi-specific fallback categories. Window A (2026-02-24) remains `hold` due to latency/fallback failures; Window B shows fallback recovery (0%) but fails minimum-sample/baseline requirements, so expansion remains blocked.",
|
||||
"summary": "Formal Pi embedded canary evaluation is active with audit-log summarization, minimum-sample thresholds, and guard-coverage thresholds for controlled probe windows. Window A remains `hold` due to latency/fallback failures; Window B shows fallback recovery (0%) but fails sample/baseline minimums; Window C pre-probe baseline confirms guard-coverage evidence is still missing, so expansion remains blocked.",
|
||||
"files_modified": [
|
||||
"src/audit/backendCanarySummary.ts",
|
||||
"src/audit/backendCanarySummary.test.ts",
|
||||
@@ -20,12 +20,14 @@
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md",
|
||||
"docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json",
|
||||
"docs/architecture/AGENT_DIAGRAM.md",
|
||||
"docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md",
|
||||
"docs/api/PROTOCOL.md",
|
||||
"docs/plans/state.json"
|
||||
],
|
||||
"test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log with minimum-sample gates (Window A + Window B artifacts regenerated) passing; pnpm lint unchanged warnings-only baseline"
|
||||
"test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log with minimum-sample + guard-coverage gates (Window A/B regenerated, Window C pre-probe generated) passing; pnpm lint unchanged warnings-only baseline"
|
||||
},
|
||||
"pi-embedded-backend-canary-spike": {
|
||||
"status": "completed",
|
||||
@@ -6486,7 +6488,7 @@
|
||||
"remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
|
||||
"next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas",
|
||||
"pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
|
||||
"pi_embedded_evaluation_phase": "in progress — minimum-sample gates enforced; Window A HOLD (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B has 0% fallback but fails sample/baseline gates; no cohort expansion"
|
||||
"pi_embedded_evaluation_phase": "in progress — minimum-sample and guard-coverage gates enforced; Window A HOLD (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B has 0% fallback but fails sample/baseline gates; Window C pre-probe shows 0 guard hits across pi_no_tools_mode/capability_query/attachments_present; no cohort expansion"
|
||||
},
|
||||
"soul_md_and_cron_create": {
|
||||
"date": "2026-02-11",
|
||||
|
||||
Reference in New Issue
Block a user